parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor.
  20
  21 # The implementation is a pretty direct implementation of the parsing algorithm
  22 # described here:
  23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
  24 #
  25 # Deviations from that spec:
  26 #
  27 #   Purposeful: search this file for "WHATWG"
  28 #
  29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
  30
  31
  32 # stacks/lists
  33 #
# the spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
# (both as stacks)
  38 #
  39 # stacks grow downward (current element is index=0)
  40 #
  41 # example: open_els = [a, b, c, d, e, f, g]
  42 #
  43 # "grows downwards" means it's visualized like this: (index: el, names)
  44 #
  45 #   6: g "start of the list", "topmost", "first"
  46 #   5: f
  47 #   4: e "previous" (to d), "above", "before"
  48 #   3: d   (previous/next are relative to this element)
  49 #   2: c "next", "after", "lower", "below"
  50 #   1: b
  51 #   0: a "end of the list", "current node", "bottommost", "last"
  52
  53
  54 # browser
  55 # note: to get this to run outside a browser, you'll have to write a native
  56 # implementation of decode_named_char_ref()
  57 unless module?.exports?
  58         window.wheic = {}
  59         module = exports: window.wheic
  60
  61 from_code_point = (x) ->
  62         if String.fromCodePoint?
  63                 return String.fromCodePoint x
  64         else
  65                 if x <= 0xffff
  66                         return String.fromCharCode x
  67                 x -= 0x10000
  68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
  69
  70 # Each node is an obect of the Node class. Here are the Node types:
  71 TYPE_TAG = 0 # name, {attributes}, [children]
  72 TYPE_TEXT = 1 # "text"
  73 TYPE_COMMENT = 2
  74 TYPE_DOCTYPE = 3
  75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  77 TYPE_END_TAG = 5 # name
  78 TYPE_EOF = 6
  79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  81
  82 # namespace constants
  83 NS_HTML = 1
  84 NS_MATHML = 2
  85 NS_SVG = 3
  86
  87 # quirks mode constants
  88 QUIRKS_NO = 1
  89 QUIRKS_LIMITED = 2
  90 QUIRKS_YES = 3
  91
  92 g_debug_log = []
  93 debug_log_reset = ->
  94         g_debug_log = []
  95 debug_log = (str) ->
  96         g_debug_log.push str
  97 debug_log_each = (cb) ->
  98         for str in g_debug_log
  99                 cb str
 100
 101 prev_node_id = 0
 102 class Node
 103         constructor: (type, args = {}) ->
 104                 @type = type # one of the TYPE_* constants above
 105                 @name = args.name ? '' # tag name
 106                 @text = args.text ? '' # contents for text/comment nodes
 107                 @attrs = args.attrs ? {}
 108                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
 109                 @children = args.children ? []
 110                 @namespace = args.namespace ? NS_HTML
 111                 @parent = args.parent ? null
 112                 @token = args.token ? null
 113                 @flags = args.flags ? {}
 114                 if args.id?
 115                         @id = "#{args.id}+"
 116                 else
 117                         @id = "#{++prev_node_id}"
 118         acknowledge_self_closing: ->
 119                 if @token?
 120                         @token.flag 'did_self_close', true
 121                 else
 122                         @flag 'did_self_close', true
 123         flag: (key, value = null) ->
 124                 if value?
 125                         @flags[key] = value
 126                 else
 127                         return @flags[key]
 128         serialize: (shallow = false, show_ids = false) -> # for unit tests
 129                 ret = ''
 130                 switch @type
 131                         when TYPE_TAG
 132                                 ret += 'tag:'
 133                                 ret += JSON.stringify @name
 134                                 ret += ','
 135                                 if show_ids
 136                                         ret += "##{@id},"
 137                                 if shallow
 138                                         break
 139                                 attr_keys = []
 140                                 for k of @attrs
 141                                         attr_keys.push k
 142                                 attr_keys.sort()
 143                                 ret += '{'
 144                                 sep = ''
 145                                 for k in attr_keys
 146                                         ret += sep
 147                                         sep = ','
 148                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 149                                 ret += '},['
 150                                 sep = ''
 151                                 for c in @children
 152                                         ret += sep
 153                                         sep = ','
 154                                         ret += c.serialize shallow, show_ids
 155                                 ret += ']'
 156                         when TYPE_TEXT
 157                                 ret += 'text:'
 158                                 ret += JSON.stringify @text
 159                         when TYPE_COMMENT
 160                                 ret += 'comment:'
 161                                 ret += JSON.stringify @text
 162                         when TYPE_DOCTYPE
 163                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
 164                         when TYPE_AFE_MARKER
 165                                 ret += 'marker'
 166                         when TYPE_AAA_BOOKMARK
 167                                 ret += 'aaa_bookmark'
 168                         else
 169                                 ret += 'unknown:'
 170                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 171                 return ret
 172
 173 # helpers: (only take args that are normally known when parser creates nodes)
 174 new_open_tag = (name) ->
 175         return new Node TYPE_START_TAG, name: name
 176 new_end_tag = (name) ->
 177         return new Node TYPE_END_TAG, name: name
 178 new_element = (name) ->
 179         return new Node TYPE_TAG, name: name
 180 new_text_node = (txt) ->
 181         return new Node TYPE_TEXT, text: txt
 182 new_character_token = new_text_node
 183 new_comment_token = (txt) ->
 184         return new Node TYPE_COMMENT, text: txt
 185 new_doctype_token = (name) ->
 186         return new Node TYPE_DOCTYPE, name: name
 187 new_eof_token = ->
 188         return new Node TYPE_EOF
 189 new_afe_marker = ->
 190         return new Node TYPE_AFE_MARKER
 191 new_aaa_bookmark = ->
 192         return new Node TYPE_AAA_BOOKMARK
 193
 194 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 195 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 196 digits = "0123456789"
 197 alnum = lc_alpha + uc_alpha + digits
 198 hex_chars = digits + "abcdefABCDEF"
 199
 200 is_uc_alpha = (str) ->
 201         return str.length is 1 and uc_alpha.indexOf(str) > -1
 202 is_lc_alpha = (str) ->
 203         return str.length is 1 and lc_alpha.indexOf(str) > -1
 204
 205 # some SVG elements have dashes in them
 206 tag_name_chars = alnum + "-"
 207
 208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 210 is_space = (txt) ->
 211         return txt.length is 1 and space_chars.indexOf(txt) > -1
 212 is_space_tok = (t) ->
 213         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 214
 215 is_input_hidden_tok = (t) ->
 216         return false unless t.type is TYPE_START_TAG
 217         for a in t.attrs_a
 218                 if a[0] is 'type'
 219                         if a[1].toLowerCase() is 'hidden'
 220                                 return true
 221                         return false
 222         return false
 223
 224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 226
 227 unicode_fixes = {}
 228 unicode_fixes[0x00] = "\uFFFD"
 229 unicode_fixes[0x80] = "\u20AC"
 230 unicode_fixes[0x82] = "\u201A"
 231 unicode_fixes[0x83] = "\u0192"
 232 unicode_fixes[0x84] = "\u201E"
 233 unicode_fixes[0x85] = "\u2026"
 234 unicode_fixes[0x86] = "\u2020"
 235 unicode_fixes[0x87] = "\u2021"
 236 unicode_fixes[0x88] = "\u02C6"
 237 unicode_fixes[0x89] = "\u2030"
 238 unicode_fixes[0x8A] = "\u0160"
 239 unicode_fixes[0x8B] = "\u2039"
 240 unicode_fixes[0x8C] = "\u0152"
 241 unicode_fixes[0x8E] = "\u017D"
 242 unicode_fixes[0x91] = "\u2018"
 243 unicode_fixes[0x92] = "\u2019"
 244 unicode_fixes[0x93] = "\u201C"
 245 unicode_fixes[0x94] = "\u201D"
 246 unicode_fixes[0x95] = "\u2022"
 247 unicode_fixes[0x96] = "\u2013"
 248 unicode_fixes[0x97] = "\u2014"
 249 unicode_fixes[0x98] = "\u02DC"
 250 unicode_fixes[0x99] = "\u2122"
 251 unicode_fixes[0x9A] = "\u0161"
 252 unicode_fixes[0x9B] = "\u203A"
 253 unicode_fixes[0x9C] = "\u0153"
 254 unicode_fixes[0x9E] = "\u017E"
 255 unicode_fixes[0x9F] = "\u0178"
 256
 257 quirks_yes_pi_prefixes = [
 258         "+//silmaril//dtd html pro v0r11 19970101//"
 259         "-//as//dtd html 3.0 aswedit + extensions//"
 260         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
 261         "-//ietf//dtd html 2.0 level 1//"
 262         "-//ietf//dtd html 2.0 level 2//"
 263         "-//ietf//dtd html 2.0 strict level 1//"
 264         "-//ietf//dtd html 2.0 strict level 2//"
 265         "-//ietf//dtd html 2.0 strict//"
 266         "-//ietf//dtd html 2.0//"
 267         "-//ietf//dtd html 2.1e//"
 268         "-//ietf//dtd html 3.0//"
 269         "-//ietf//dtd html 3.2 final//"
 270         "-//ietf//dtd html 3.2//"
 271         "-//ietf//dtd html 3//"
 272         "-//ietf//dtd html level 0//"
 273         "-//ietf//dtd html level 1//"
 274         "-//ietf//dtd html level 2//"
 275         "-//ietf//dtd html level 3//"
 276         "-//ietf//dtd html strict level 0//"
 277         "-//ietf//dtd html strict level 1//"
 278         "-//ietf//dtd html strict level 2//"
 279         "-//ietf//dtd html strict level 3//"
 280         "-//ietf//dtd html strict//"
 281         "-//ietf//dtd html//"
 282         "-//metrius//dtd metrius presentational//"
 283         "-//microsoft//dtd internet explorer 2.0 html strict//"
 284         "-//microsoft//dtd internet explorer 2.0 html//"
 285         "-//microsoft//dtd internet explorer 2.0 tables//"
 286         "-//microsoft//dtd internet explorer 3.0 html strict//"
 287         "-//microsoft//dtd internet explorer 3.0 html//"
 288         "-//microsoft//dtd internet explorer 3.0 tables//"
 289         "-//netscape comm. corp.//dtd html//"
 290         "-//netscape comm. corp.//dtd strict html//"
 291         "-//o'reilly and associates//dtd html 2.0//"
 292         "-//o'reilly and associates//dtd html extended 1.0//"
 293         "-//o'reilly and associates//dtd html extended relaxed 1.0//"
 294         "-//sq//dtd html 2.0 hotmetal + extensions//"
 295         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
 296         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
 297         "-//spyglass//dtd html 2.0 extended//"
 298         "-//sun microsystems corp.//dtd hotjava html//"
 299         "-//sun microsystems corp.//dtd hotjava strict html//"
 300         "-//w3c//dtd html 3 1995-03-24//"
 301         "-//w3c//dtd html 3.2 draft//"
 302         "-//w3c//dtd html 3.2 final//"
 303         "-//w3c//dtd html 3.2//"
 304         "-//w3c//dtd html 3.2s draft//"
 305         "-//w3c//dtd html 4.0 frameset//"
 306         "-//w3c//dtd html 4.0 transitional//"
 307         "-//w3c//dtd html experimental 19960712//"
 308         "-//w3c//dtd html experimental 970421//"
 309         "-//w3c//dtd w3 html//"
 310         "-//w3o//dtd w3 html 3.0//"
 311         "-//webtechs//dtd mozilla html 2.0//"
 312         "-//webtechs//dtd mozilla html//"
 313 ]
 314
 315 # These are the character references that don't need a terminating semicolon
 316 # min length: 2, max: 6, none are a prefix of any other.
 317 legacy_char_refs = {
 318         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 319         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 320         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 321         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 322         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 323         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 324         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 325         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 326         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 327         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 328         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 329         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 330         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 331         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 332         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 333         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 334         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 335         yen: '¥', yuml: 'ÿ'
 336 }
 337
 338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 339 raw_text_elements = ['script', 'style']
 340 escapable_raw_text_elements = ['textarea', 'title']
 341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 342 svg_elements = [
 343         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 344         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 345         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 346         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 347         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 348         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 349         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 350         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 351         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 352         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 353         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 354         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 355         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 356         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 357         'view', 'vkern'
 358 ]
 359
 360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 361 mathml_elements = [
 362         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 363         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 364         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 365         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 366         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 367         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 368         'determinant', 'diff', 'divergence', 'divide', 'domain',
 369         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 370         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 371         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 372         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 373         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 374         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 375         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 376         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 377         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 378         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 379         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 380         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 381         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 382         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 383         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 384         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 385         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 386         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 387         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 388         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 389         'vectorproduct', 'xor'
 390 ]
 391 # foreign_elements = [svg_elements..., mathml_elements...]
 392 #normal_elements = All other allowed HTML elements are normal elements.
 393
 394 special_elements = {
 395         # HTML:
 396         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 397         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 398         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 399         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 400         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 401         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 402         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 403         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 404         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 405         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 406         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
 407
 408         menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
 409
 410         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
 411         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
 412         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
 413         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
 414         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
 415         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
 416         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
 417
 418         # MathML:
 419         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 420         'annotation-xml':NS_MATHML,
 421
 422         # SVG:
 423         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 424 }
 425
 426 formatting_elements = {
 427          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 428          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 429          u: true
 430 }
 431
 432 mathml_text_integration = {
 433         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
 434 }
 435 is_mathml_text_integration_point = (el) ->
 436         return mathml_text_integration[el.name] is el.namespace
 437 is_html_integration = (el) -> # DON'T PASS A TOKEN
 438         if el.namespace is NS_MATHML
 439                 if el.name is 'annotation-xml'
 440                         if el.attrs.encoding?
 441                                 if el.attrs.encoding.toLowerCase() is 'text/html'
 442                                         return true
 443                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
 444                                         return true
 445                 return false
 446         if el.namespace is NS_SVG
 447                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
 448                         return true
 449         return false
 450
 451 h_tags = {
 452         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
 453 }
 454
 455 foster_parenting_targets = {
 456         table: NS_HTML
 457         tbody: NS_HTML
 458         tfoot: NS_HTML
 459         thead: NS_HTML
 460         tr: NS_HTML
 461 }
 462
 463 end_tag_implied = {
 464         dd: NS_HTML
 465         dt: NS_HTML
 466         li: NS_HTML
 467         option: NS_HTML
 468         optgroup: NS_HTML
 469         p: NS_HTML
 470         rb: NS_HTML
 471         rp: NS_HTML
 472         rt: NS_HTML
 473         rtc: NS_HTML
 474 }
 475
 476 el_is_special = (e) ->
 477         return special_elements[e.name] is e.namespace
 478
 479 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
 480 el_is_special_not_adp = (el) ->
 481         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
 482
 483 svg_name_fixes = {
 484         altglyph: 'altGlyph'
 485         altglyphdef: 'altGlyphDef'
 486         altglyphitem: 'altGlyphItem'
 487         animatecolor: 'animateColor'
 488         animatemotion: 'animateMotion'
 489         animatetransform: 'animateTransform'
 490         clippath: 'clipPath'
 491         feblend: 'feBlend'
 492         fecolormatrix: 'feColorMatrix'
 493         fecomponenttransfer: 'feComponentTransfer'
 494         fecomposite: 'feComposite'
 495         feconvolvematrix: 'feConvolveMatrix'
 496         fediffuselighting: 'feDiffuseLighting'
 497         fedisplacementmap: 'feDisplacementMap'
 498         fedistantlight: 'feDistantLight'
 499         fedropshadow: 'feDropShadow'
 500         feflood: 'feFlood'
 501         fefunca: 'feFuncA'
 502         fefuncb: 'feFuncB'
 503         fefuncg: 'feFuncG'
 504         fefuncr: 'feFuncR'
 505         fegaussianblur: 'feGaussianBlur'
 506         feimage: 'feImage'
 507         femerge: 'feMerge'
 508         femergenode: 'feMergeNode'
 509         femorphology: 'feMorphology'
 510         feoffset: 'feOffset'
 511         fepointlight: 'fePointLight'
 512         fespecularlighting: 'feSpecularLighting'
 513         fespotlight: 'feSpotLight'
 514         fetile: 'feTile'
 515         feturbulence: 'feTurbulence'
 516         foreignobject: 'foreignObject'
 517         glyphref: 'glyphRef'
 518         lineargradient: 'linearGradient'
 519         radialgradient: 'radialGradient'
 520         textpath: 'textPath'
 521 }
 522 svg_attribute_fixes = {
 523         attributename: 'attributeName'
 524         attributetype: 'attributeType'
 525         basefrequency: 'baseFrequency'
 526         baseprofile: 'baseProfile'
 527         calcmode: 'calcMode'
 528         clippathunits: 'clipPathUnits'
 529         contentscripttype: 'contentScriptType'
 530         contentstyletype: 'contentStyleType'
 531         diffuseconstant: 'diffuseConstant'
 532         edgemode: 'edgeMode'
 533         externalresourcesrequired: 'externalResourcesRequired'
 534         # WHATWG removes this: filterres: 'filterRes'
 535         filterunits: 'filterUnits'
 536         glyphref: 'glyphRef'
 537         gradienttransform: 'gradientTransform'
 538         gradientunits: 'gradientUnits'
 539         kernelmatrix: 'kernelMatrix'
 540         kernelunitlength: 'kernelUnitLength'
 541         keypoints: 'keyPoints'
 542         keysplines: 'keySplines'
 543         keytimes: 'keyTimes'
 544         lengthadjust: 'lengthAdjust'
 545         limitingconeangle: 'limitingConeAngle'
 546         markerheight: 'markerHeight'
 547         markerunits: 'markerUnits'
 548         markerwidth: 'markerWidth'
 549         maskcontentunits: 'maskContentUnits'
 550         maskunits: 'maskUnits'
 551         numoctaves: 'numOctaves'
 552         pathlength: 'pathLength'
 553         patterncontentunits: 'patternContentUnits'
 554         patterntransform: 'patternTransform'
 555         patternunits: 'patternUnits'
 556         pointsatx: 'pointsAtX'
 557         pointsaty: 'pointsAtY'
 558         pointsatz: 'pointsAtZ'
 559         preservealpha: 'preserveAlpha'
 560         preserveaspectratio: 'preserveAspectRatio'
 561         primitiveunits: 'primitiveUnits'
 562         refx: 'refX'
 563         refy: 'refY'
 564         repeatcount: 'repeatCount'
 565         repeatdur: 'repeatDur'
 566         requiredextensions: 'requiredExtensions'
 567         requiredfeatures: 'requiredFeatures'
 568         specularconstant: 'specularConstant'
 569         specularexponent: 'specularExponent'
 570         spreadmethod: 'spreadMethod'
 571         startoffset: 'startOffset'
 572         stddeviation: 'stdDeviation'
 573         stitchtiles: 'stitchTiles'
 574         surfacescale: 'surfaceScale'
 575         systemlanguage: 'systemLanguage'
 576         tablevalues: 'tableValues'
 577         targetx: 'targetX'
 578         targety: 'targetY'
 579         textlength: 'textLength'
 580         viewbox: 'viewBox'
 581         viewtarget: 'viewTarget'
 582         xchannelselector: 'xChannelSelector'
 583         ychannelselector: 'yChannelSelector'
 584         zoomandpan: 'zoomAndPan'
 585 }
 586 foreign_attr_fixes = {
 587         'xlink:actuate': 'xlink actuate'
 588         'xlink:arcrole': 'xlink arcrole'
 589         'xlink:href': 'xlink href'
 590         'xlink:role': 'xlink role'
 591         'xlink:show': 'xlink show'
 592         'xlink:title': 'xlink title'
 593         'xlink:type': 'xlink type'
 594         'xml:base': 'xml base'
 595         'xml:lang': 'xml lang'
 596         'xml:space': 'xml space'
 597         'xmlns': 'xmlns'
 598         'xmlns:xlink': 'xmlns xlink'
 599 }
 600 adjust_mathml_attributes = (t) ->
 601         for a in t.attrs_a
 602                 if a[0] is 'definitionurl'
 603                         a[0] = 'definitionURL'
 604         return
 605 adjust_svg_attributes = (t) ->
 606         for a in t.attrs_a
 607                 if svg_attribute_fixes[a[0]]?
 608                         a[0] = svg_attribute_fixes[a[0]]
 609         return
 610 adjust_foreign_attributes = (t) ->
 611         # fixfull
 612         for a in t.attrs_a
 613                 if foreign_attr_fixes[a[0]]?
 614                         a[0] = foreign_attr_fixes[a[0]]
 615         return
 616
 617 # decode_named_char_ref()
 618 #
 619 # The list of named character references is _huge_ so ask the browser to decode
 620 # for us instead of wasting bandwidth/space on including the table here.
 621 #
 622 # Pass without the "&" but with the ";" examples:
 623 #    for "&amp" pass "amp;"
 624 #    for "&#x2032" pass "x2032;"
 625 g_dncr = {
 626         cache: {}
 627         textarea: document.createElement('textarea')
 628 }
 629 # TODO test this in IE8
 630 decode_named_char_ref = (txt) ->
 631         txt = "&#{txt}"
 632         decoded = g_dncr.cache[txt]
 633         return decoded if decoded?
 634         g_dncr.textarea.innerHTML = txt
 635         decoded = g_dncr.textarea.value
 636         return null if decoded is txt
 637         return g_dncr.cache[txt] = decoded
 638
 639 parse_html = (args) ->
 640         txt = null
 641         cur = null # index of next char in txt to be parsed
 642         # declare doc and tokenizer variables so they're in scope below
 643         doc = null
 644         open_els = null # stack of open elements
 645         afe = null # active formatting elements
 646         template_ins_modes = null
 647         ins_mode = null
 648         original_ins_mode = null
 649         tok_state = null
 650         tok_cur_tag = null # partially parsed tag
 651         flag_scripting = null
 652         flag_frameset_ok = null
 653         flag_parsing = null
 654         flag_foster_parenting = null
 655         form_element_pointer = null
 656         temporary_buffer = null
 657         pending_table_character_tokens = null
 658         head_element_pointer = null
 659         flag_fragment_parsing = null
 660         context_element = null
 661
 662         stop_parsing = ->
 663                 flag_parsing = false
 664
 665         parse_error = ->
 666                 if args.error_cb?
 667                         args.error_cb cur
 668                 else
 669                         console.log "Parse error at character #{cur} of #{txt.length}"
 670
 671         # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
 672         # "Noah's Ark clause" but with three
 673         afe_push = (new_el) ->
 674                 matches = 0
 675                 for el, i in afe
 676                         if el.type is TYPE_AFE_MARKER
 677                                 break
 678                         if el.name is new_el.name and el.namespace is new_el.namespace
 679                                 attrs_match = true
 680                                 for k, v of el.attrs
 681                                         unless new_el.attrs[k] is v
 682                                                 attrs_match = false
 683                                                 break
 684                                 if attrs_match
 685                                         for k, v of new_el.attrs
 686                                                 unless el.attrs[k] is v
 687                                                         attrs_match = false
 688                                                         break
 689                                 if attrs_match
 690                                         matches += 1
 691                                         if matches is 3
 692                                                 afe.splice i, 1
 693                                                 break
 694                 afe.unshift new_el
 695         afe_push_marker = ->
 696                 afe.unshift new_afe_marker()
 697
 698         # the functions below implement the Tree Construction algorithm
 699         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 700
 701         # But first... the helpers
 702         template_tag_is_open = ->
 703                 for el in open_els
 704                         if el.name is 'template' and el.namespace is NS_HTML
 705                                 return true
 706                 return false
 707         is_in_scope_x = (tag_name, scope, namespace) ->
 708                 for el in open_els
 709                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 710                                 return true
 711                         if scope[el.name] is el.namespace
 712                                 return false
 713                 return false
 714         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 715                 for el in open_els
 716                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 717                                 return true
 718                         if scope[el.name] is el.namespace
 719                                 return false
 720                         if scope2[el.name] is el.namespace
 721                                 return false
 722                 return false
 723         standard_scopers = {
 724                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 725                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 726                 template: NS_HTML,
 727
 728                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
 729                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
 730
 731                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 732         }
 733         button_scopers = button: NS_HTML
 734         li_scopers = ol: NS_HTML, ul: NS_HTML
 735         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 736         is_in_scope = (tag_name, namespace = null) ->
 737                 return is_in_scope_x tag_name, standard_scopers, namespace
 738         is_in_button_scope = (tag_name, namespace = null) ->
 739                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 740         is_in_table_scope = (tag_name, namespace = null) ->
 741                 return is_in_scope_x tag_name, table_scopers, namespace
 742         # aka is_in_list_item_scope
 743         is_in_li_scope = (tag_name, namespace = null) ->
 744                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
 745         is_in_select_scope = (tag_name, namespace = null) ->
 746                 for t in open_els
 747                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 748                                 return true
 749                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 750                                 return false
 751                 return false
 752         # this checks for a particular element, not by name
 753         # this requires a namespace match
 754         el_is_in_scope = (needle) ->
 755                 for el in open_els
 756                         if el is needle
 757                                 return true
 758                         if standard_scopers[el.name] is el.namespace
 759                                 return false
 760                 return false
 761
 762         clear_to_table_stopers = {
 763                 'table': true
 764                 'template': true
 765                 'html': true
 766         }
 767         clear_stack_to_table_context = ->
 768                 loop
 769                         if clear_to_table_stopers[open_els[0].name]?
 770                                 break
 771                         open_els.shift()
 772                 return
 773         clear_to_table_body_stopers = {
 774                 tbody: NS_HTML
 775                 tfoot: NS_HTML
 776                 thead: NS_HTML
 777                 template: NS_HTML
 778                 html: NS_HTML
 779         }
 780         clear_stack_to_table_body_context = ->
 781                 loop
 782                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
 783                                 break
 784                         open_els.shift()
 785                 return
 786         clear_to_table_row_stopers = {
 787                 'tr': true
 788                 'template': true
 789                 'html': true
 790         }
 791         clear_stack_to_table_row_context = ->
 792                 loop
 793                         if clear_to_table_row_stopers[open_els[0].name]?
 794                                 break
 795                         open_els.shift()
 796                 return
 797         clear_afe_to_marker = ->
 798                 loop
 799                         return unless afe.length > 0 # this happens in fragment case, ?spec error
 800                         el = afe.shift()
 801                         if el.type is TYPE_AFE_MARKER
 802                                 return
 803                 return
 804
 805         # 8.2.3.1 ...
 806         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 807         reset_ins_mode = ->
 808                 # 1. Let last be false.
 809                 last = false
 810                 # 2. Let node be the last node in the stack of open elements.
 811                 node_i = 0
 812                 node = open_els[node_i]
 813                 # 3. Loop: If node is the first node in the stack of open elements,
 814                 # then set last to true, and, if the parser was originally created as
 815                 # part of the HTML fragment parsing algorithm (fragment case) set node
 816                 # to the context element.
 817                 loop
 818                         if node_i is open_els.length - 1
 819                                 last = true
 820                                 # fixfull (fragment case)
 821
 822                         # 4. If node is a select element, run these substeps:
 823                         if node.name is 'select' and node.namespace is NS_HTML
 824                                 # 1. If last is true, jump to the step below labeled done.
 825                                 unless last
 826                                         # 2. Let ancestor be node.
 827                                         ancestor_i = node_i
 828                                         ancestor = node
 829                                         # 3. Loop: If ancestor is the first node in the stack of
 830                                         # open elements, jump to the step below labeled done.
 831                                         loop
 832                                                 if ancestor_i is open_els.length - 1
 833                                                         break
 834                                                 # 4. Let ancestor be the node before ancestor in the stack
 835                                                 # of open elements.
 836                                                 ancestor_i += 1
 837                                                 ancestor = open_els[ancestor_i]
 838                                                 # 5. If ancestor is a template node, jump to the step below
 839                                                 # labeled done.
 840                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
 841                                                         break
 842                                                 # 6. If ancestor is a table node, switch the insertion mode
 843                                                 # to "in select in table" and abort these steps.
 844                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
 845                                                         ins_mode = ins_mode_in_select_in_table
 846                                                         return
 847                                                 # 7. Jump back to the step labeled loop.
 848                                 # 8. Done: Switch the insertion mode to "in select" and abort
 849                                 # these steps.
 850                                 ins_mode = ins_mode_in_select
 851                                 return
 852                         # 5. If node is a td or th element and last is false, then switch
 853                         # the insertion mode to "in cell" and abort these steps.
 854                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
 855                                 ins_mode = ins_mode_in_cell
 856                                 return
 857                         # 6. If node is a tr element, then switch the insertion mode to "in
 858                         # row" and abort these steps.
 859                         if node.name is 'tr' and node.namespace is NS_HTML
 860                                 ins_mode = ins_mode_in_row
 861                                 return
 862                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 863                         # insertion mode to "in table body" and abort these steps.
 864                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
 865                                 ins_mode = ins_mode_in_table_body
 866                                 return
 867                         # 8. If node is a caption element, then switch the insertion mode
 868                         # to "in caption" and abort these steps.
 869                         if node.name is 'caption' and node.namespace is NS_HTML
 870                                 ins_mode = ins_mode_in_caption
 871                                 return
 872                         # 9. If node is a colgroup element, then switch the insertion mode
 873                         # to "in column group" and abort these steps.
 874                         if node.name is 'colgroup' and node.namespace is NS_HTML
 875                                 ins_mode = ins_mode_in_column_group
 876                                 return
 877                         # 10. If node is a table element, then switch the insertion mode to
 878                         # "in table" and abort these steps.
 879                         if node.name is 'table' and node.namespace is NS_HTML
 880                                 ins_mode = ins_mode_in_table
 881                                 return
 882                         # 11. If node is a template element, then switch the insertion mode
 883                         # to the current template insertion mode and abort these steps.
 884                         if node.name is 'template' and node.namespace is NS_HTML
 885                                 ins_mode = template_ins_modes[0]
 886                                 return
 887                         # 12. If node is a head element and last is true, then switch the
 888                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 889                         # these steps. (fragment case)
 890                         if node.name is 'head' and node.namespace is NS_HTML and last
 891                                 ins_mode = ins_mode_in_body
 892                                 return
 893                         # 13. If node is a head element and last is false, then switch the
 894                         # insertion mode to "in head" and abort these steps.
 895                         if node.name is 'head' and node.namespace is NS_HTML and last is false
 896                                 ins_mode = ins_mode_in_head
 897                                 return
 898                         # 14. If node is a body element, then switch the insertion mode to
 899                         # "in body" and abort these steps.
 900                         if node.name is 'body' and node.namespace is NS_HTML
 901                                 ins_mode = ins_mode_in_body
 902                                 return
 903                         # 15. If node is a frameset element, then switch the insertion mode
 904                         # to "in frameset" and abort these steps. (fragment case)
 905                         if node.name is 'frameset' and node.namespace is NS_HTML
 906                                 ins_mode = ins_mode_in_frameset
 907                                 return
 908                         # 16. If node is an html element, run these substeps:
 909                         if node.name is 'html' and node.namespace is NS_HTML
 910                                 # 1. If the head element pointer is null, switch the insertion
 911                                 # mode to "before head" and abort these steps. (fragment case)
 912                                 if head_element_pointer is null
 913                                         ins_mode = ins_mode_before_head
 914                                 else
 915                                         # 2. Otherwise, the head element pointer is not null,
 916                                         # switch the insertion mode to "after head" and abort these
 917                                         # steps.
 918                                         ins_mode = ins_mode_after_head
 919                                 return
 920                         # 17. If last is true, then switch the insertion mode to "in body"
 921                         # and abort these steps. (fragment case)
 922                         if last
 923                                 ins_mode = ins_mode_in_body
 924                                 return
 925                         # 18. Let node now be the node before node in the stack of open
 926                         # elements.
 927                         node_i += 1
 928                         node = open_els[node_i]
 929                         # 19. Return to the step labeled loop.
 930
 931         # 8.2.3.2
 932
 933         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
 934         adjusted_current_node = ->
 935                 if open_els.length is 1 and flag_fragment_parsing
 936                         return context_element
 937                 return open_els[0]
 938
 939         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 940         # this implementation is structured (mostly) as described at the link above.
 941         # capitalized comments are the "labels" described at the link above.
 942         reconstruct_afe = ->
 943                 return if afe.length is 0
 944                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 945                         return
 946                 # Rewind
 947                 i = 0
 948                 loop
 949                         if i is afe.length - 1
 950                                 break
 951                         i += 1
 952                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 953                                 i -= 1 # Advance
 954                                 break
 955                 # Create
 956                 loop
 957                         el = insert_html_element afe[i].token
 958                         afe[i] = el
 959                         break if i is 0
 960                         i -= 1 # Advance
 961
 962         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 963         # adoption agency algorithm
 964         # overview here:
 965         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 966         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 967         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 968         adoption_agency = (subject) ->
 969                 debug_log "adoption_agency()"
 970                 debug_log "tree: #{serialize_els doc.children, false, true}"
 971                 debug_log "open_els: #{serialize_els open_els, true, true}"
 972                 debug_log "afe: #{serialize_els afe, true, true}"
 973 # this block implements tha W3C spec
 974 #               # 1. If the current node is an HTML element whose tag name is subject,
 975 #               # then run these substeps:
 976 #               #
 977 #               # 1. Let element be the current node.
 978 #               #
 979 #               # 2. Pop element off the stack of open elements.
 980 #               #
 981 #               # 3. If element is also in the list of active formatting elements,
 982 #               # remove the element from the list.
 983 #               #
 984 #               # 4. Abort the adoption agency algorithm.
 985 #               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
 986 #                       el = open_els.shift()
 987 #                       # remove it from the list of active formatting elements (if found)
 988 #                       for t, i in afe
 989 #                               if t is el
 990 #                                       afe.splice i, 1
 991 #                                       break
 992 #                       debug_log "aaa: starting off with subject on top of stack, exiting"
 993 #                       return
 994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 995                 # If the current node is an HTML element whose tag name is subject, and
 996                 # the current node is not in the list of active formatting elements,
 997                 # then pop the current node off the stack of open elements, and abort
 998                 # these steps.
 999                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000                         debug_log "aaa: starting off with subject on top of stack, exiting"
1001                         # remove it from the list of active formatting elements (if found)
1002                         in_afe = false
1003                         for el, i in afe
1004                                 if el is open_els[0]
1005                                         in_afe = true
1006                                         break
1007                         unless in_afe
1008                                 debug_log "aaa: ...and not in afe, aaa done"
1009                                 open_els.shift()
1010                                 return
1011                         # fall through
1012 # END WHATWG
1013                 outer = 0
1014                 loop
1015                         if outer >= 8
1016                                 return
1017                         outer += 1
1018                         # 5. Let formatting element be the last element in the list of
1019                         # active formatting elements that: is between the end of the list
1020                         # and the last scope marker in the list, if any, or the start of
1021                         # the list otherwise, and  has the tag name subject.
1022                         fe = null
1023                         for t, fe_of_afe in afe
1024                                 if t.type is TYPE_AFE_MARKER
1025                                         break
1026                                 if t.name is subject
1027                                         fe = t
1028                                         break
1029                         # If there is no such element, then abort these steps and instead
1030                         # act as described in the "any other end tag" entry above.
1031                         if fe is null
1032                                 debug_log "aaa: fe not found in afe"
1033                                 in_body_any_other_end_tag subject
1034                                 return
1035                         # 6. If formatting element is not in the stack of open elements,
1036                         # then this is a parse error; remove the element from the list, and
1037                         # abort these steps.
1038                         in_open_els = false
1039                         for t, fe_of_open_els in open_els
1040                                 if t is fe
1041                                         in_open_els = true
1042                                         break
1043                         unless in_open_els
1044                                 debug_log "aaa: fe not found in open_els"
1045                                 parse_error()
1046                                 # "remove it from the list" must mean afe, since it's not in open_els
1047                                 afe.splice fe_of_afe, 1
1048                                 return
1049                         # 7. If formatting element is in the stack of open elements, but
1050                         # the element is not in scope, then this is a parse error; abort
1051                         # these steps.
1052                         unless el_is_in_scope fe
1053                                 debug_log "aaa: fe not in scope"
1054                                 parse_error()
1055                                 return
1056                         # 8. If formatting element is not the current node, this is a parse
1057                         # error. (But do not abort these steps.)
1058                         unless open_els[0] is fe
1059                                 parse_error()
1060                                 # continue
1061                         # 9. Let furthest block be the topmost node in the stack of open
1062                         # elements that is lower in the stack than formatting element, and
1063                         # is an element in the special category. There might not be one.
1064                         fb = null
1065                         fb_of_open_els = null
1066                         for t, i in open_els
1067                                 if t is fe
1068                                         break
1069                                 if el_is_special t
1070                                         fb = t
1071                                         fb_of_open_els = i
1072                                         # and continue, to see if there's one that's more "topmost"
1073                         # 10. If there is no furthest block, then the UA must first pop all
1074                         # the nodes from the bottom of the stack of open elements, from the
1075                         # current node up to and including formatting element, then remove
1076                         # formatting element from the list of active formatting elements,
1077                         # and finally abort these steps.
1078                         if fb is null
1079                                 debug_log "aaa: no fb"
1080                                 loop
1081                                         t = open_els.shift()
1082                                         if t is fe
1083                                                 afe.splice fe_of_afe, 1
1084                                                 return
1085                         # 11. Let common ancestor be the element immediately above
1086                         # formatting element in the stack of open elements.
1087                         ca = open_els[fe_of_open_els + 1] # common ancestor
1088
1089                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091                         bookmark = new_aaa_bookmark()
1092                         for t, i in afe
1093                                 if t is fe
1094                                         afe.splice i, 0, bookmark
1095                                         break
1096                         node = last_node = fb
1097                         inner = 0
1098                         loop
1099                                 inner += 1
1100                                 # 3. Let node be the element immediately above node in the
1101                                 # stack of open elements, or if node is no longer in the stack
1102                                 # of open elements (e.g. because it got removed by this
1103                                 # algorithm), the element that was immediately above node in
1104                                 # the stack of open elements before node was removed.
1105                                 node_next = null
1106                                 for t, i in open_els
1107                                         if t is node
1108                                                 node_next = open_els[i + 1]
1109                                                 break
1110                                 node = node_next ? node_above
1111                                 debug_log "inner loop #{inner}"
1112                                 debug_log "tree: #{serialize_els doc.children, false, true}"
1113                                 debug_log "open_els: #{serialize_els open_els, true, true}"
1114                                 debug_log "afe: #{serialize_els afe, true, true}"
1115                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118                                 debug_log "node: #{node.serialize true, true}"
1119                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1120
1121                                 # 4. If node is formatting element, then go to the next step in
1122                                 # the overall algorithm.
1123                                 if node is fe
1124                                         break
1125                                 debug_log "the meat"
1126                                 # 5. If inner loop counter is greater than three and node is in
1127                                 # the list of active formatting elements, then remove node from
1128                                 # the list of active formatting elements.
1129                                 node_in_afe = false
1130                                 for t, i in afe
1131                                         if t is node
1132                                                 if inner > 3
1133                                                         afe.splice i, 1
1134                                                         debug_log "max out inner"
1135                                                 else
1136                                                         node_in_afe = true
1137                                                         debug_log "in afe"
1138                                                 break
1139                                 # 6. If node is not in the list of active formatting elements,
1140                                 # then remove node from the stack of open elements and then go
1141                                 # back to the step labeled inner loop.
1142                                 unless node_in_afe
1143                                         debug_log "not in afe"
1144                                         for t, i in open_els
1145                                                 if t is node
1146                                                         node_above = open_els[i + 1]
1147                                                         open_els.splice i, 1
1148                                                         break
1149                                         continue
1150                                 debug_log "the bones"
1151                                 # 7. create an element for the token for which the element node
1152                                 # was created, in the HTML namespace, with common ancestor as
1153                                 # the intended parent; replace the entry for node in the list
1154                                 # of active formatting elements with an entry for the new
1155                                 # element, replace the entry for node in the stack of open
1156                                 # elements with an entry for the new element, and let node be
1157                                 # the new element.
1158                                 new_node = token_to_element node.token, NS_HTML, ca
1159                                 for t, i in afe
1160                                         if t is node
1161                                                 afe[i] = new_node
1162                                                 debug_log "replaced in afe"
1163                                                 break
1164                                 for t, i in open_els
1165                                         if t is node
1166                                                 node_above = open_els[i + 1]
1167                                                 open_els[i] = new_node
1168                                                 debug_log "replaced in open_els"
1169                                                 break
1170                                 node = new_node
1171                                 # 8. If last node is furthest block, then move the
1172                                 # aforementioned bookmark to be immediately after the new node
1173                                 # in the list of active formatting elements.
1174                                 if last_node is fb
1175                                         for t, i in afe
1176                                                 if t is bookmark
1177                                                         afe.splice i, 1
1178                                                         debug_log "removed bookmark"
1179                                                         break
1180                                         for t, i in afe
1181                                                 if t is node
1182                                                         # "after" means lower
1183                                                         afe.splice i, 0, bookmark # "after as <-
1184                                                         debug_log "placed bookmark after node"
1185                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1186                                                         break
1187                                 # 9. Insert last node into node, first removing it from its
1188                                 # previous parent node if any.
1189                                 if last_node.parent?
1190                                         debug_log "last_node has parent"
1191                                         for c, i in last_node.parent.children
1192                                                 if c is last_node
1193                                                         debug_log "removing last_node from parent"
1194                                                         last_node.parent.children.splice i, 1
1195                                                         break
1196                                 node.children.push last_node
1197                                 last_node.parent = node
1198                                 # 10. Let last node be node.
1199                                 last_node = node
1200                                 debug_log "at last"
1201                                 # 11. Return to the step labeled inner loop.
1202                         # 14. Insert whatever last node ended up being in the previous step
1203                         # at the appropriate place for inserting a node, but using common
1204                         # ancestor as the override target.
1205
1206                         # In the case where fe is immediately followed by fb:
1207                         #   * inner loop exits out early (node==fe)
1208                         #   * last_node is fb
1209                         #   * last_node is still in the tree (not a duplicate)
1210                         if last_node.parent?
1211                                 debug_log "FEFIRST? last_node has parent"
1212                                 for c, i in last_node.parent.children
1213                                         if c is last_node
1214                                                 debug_log "removing last_node from parent"
1215                                                 last_node.parent.children.splice i, 1
1216                                                 break
1217
1218                         debug_log "after aaa inner loop"
1219                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223                         debug_log "tree: #{serialize_els doc.children, false, true}"
1224
1225                         debug_log "insert"
1226
1227
1228                         # can't use standard insert token thing, because it's already in
1229                         # open_els and must stay at it's current position in open_els
1230                         dest = adjusted_insertion_location ca
1231                         dest[0].children.splice dest[1], 0, last_node
1232                         last_node.parent = dest[0]
1233
1234
1235                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239                         debug_log "tree: #{serialize_els doc.children, false, true}"
1240
1241                         # 15. Create an element for the token for which formatting element
1242                         # was created, in the HTML namespace, with furthest block as the
1243                         # intended parent.
1244                         new_element = token_to_element fe.token, NS_HTML, fb
1245                         # 16. Take all of the child nodes of furthest block and append them
1246                         # to the element created in the last step.
1247                         while fb.children.length
1248                                 t = fb.children.shift()
1249                                 t.parent = new_element
1250                                 new_element.children.push t
1251                         # 17. Append that new element to furthest block.
1252                         new_element.parent = fb
1253                         fb.children.push new_element
1254                         # 18. Remove formatting element from the list of active formatting
1255                         # elements, and insert the new element into the list of active
1256                         # formatting elements at the position of the aforementioned
1257                         # bookmark.
1258                         for t, i in afe
1259                                 if t is fe
1260                                         afe.splice i, 1
1261                                         break
1262                         for t, i in afe
1263                                 if t is bookmark
1264                                         afe[i] = new_element
1265                                         break
1266                         # 19. Remove formatting element from the stack of open elements,
1267                         # and insert the new element into the stack of open elements
1268                         # immediately below the position of furthest block in that stack.
1269                         for t, i in open_els
1270                                 if t is fe
1271                                         open_els.splice i, 1
1272                                         break
1273                         for t, i in open_els
1274                                 if t is fb
1275                                         open_els.splice i, 0, new_element
1276                                         break
1277                         # 20. Jump back to the step labeled outer loop.
1278                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279                         debug_log "tree: #{serialize_els doc.children, false, true}"
1280                         debug_log "open_els: #{serialize_els open_els, true, true}"
1281                         debug_log "afe: #{serialize_els afe, true, true}"
1282                 debug_log "AAA DONE"
1283
1284         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1285         close_p_element = ->
1286                 generate_implied_end_tags 'p' # arg is exception
1287                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1288                         parse_error()
1289                 while open_els.length > 1 # just in case
1290                         el = open_els.shift()
1291                         if el.name is 'p' and el.namespace is NS_HTML
1292                                 return
1293         close_p_if_in_button_scope = ->
1294                 if is_in_button_scope 'p', NS_HTML
1295                         close_p_element()
1296
1297         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298         # aka insert_a_character = (t) ->
1299         insert_character = (t) ->
1300                 dest = adjusted_insertion_location()
1301                 # fixfull check for Document node
1302                 if dest[1] > 0
1303                         prev = dest[0].children[dest[1] - 1]
1304                         if prev.type is TYPE_TEXT
1305                                 prev.text += t.text
1306                                 return
1307                 dest[0].children.splice dest[1], 0, t
1308
1309
1310         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1311         process_token = (t) ->
1312                 acn = adjusted_current_node()
1313                 unless acn?
1314                         ins_mode t
1315                         return
1316                 if acn.namespace is NS_HTML
1317                         ins_mode t
1318                         return
1319                 if is_mathml_text_integration_point(acn)
1320                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1321                                 ins_mode t
1322                                 return
1323                         if t.type is TYPE_TEXT
1324                                 ins_mode t
1325                                 return
1326                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1327                         ins_mode t
1328                         return
1329                 if is_html_integration acn
1330                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1331                                 ins_mode t
1332                                 return
1333                 if t.type is TYPE_EOF
1334                         ins_mode t
1335                         return
1336                 in_foreign_content t
1337                 return
1338
1339         # 8.2.5.1
1340         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1342         adjusted_insertion_location = (override_target = null) ->
1343                 # 1. If there was an override target specified, then let target be the
1344                 # override target.
1345                 if override_target?
1346                         target = override_target
1347                 else # Otherwise, let target be the current node.
1348                         target = open_els[0]
1349                 # 2. Determine the adjusted insertion location using the first matching
1350                 # steps from the following list:
1351                 #
1352                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353                 # thead, or tr element Foster parenting happens when content is
1354                 # misnested in tables.
1355                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356                         loop # once. this is here so we can ``break`` to "abort these substeps"
1357                                 # 1. Let last template be the last template element in the
1358                                 # stack of open elements, if any.
1359                                 last_template = null
1360                                 last_template_i = null
1361                                 for el, i in open_els
1362                                         if el.name is 'template' and el.namespace is NS_HTML
1363                                                 last_template = el
1364                                                 last_template_i = i
1365                                                 break
1366                                 # 2. Let last table be the last table element in the stack of
1367                                 # open elements, if any.
1368                                 last_table = null
1369                                 last_table_i
1370                                 for el, i in open_els
1371                                         if el.name is 'table' and el.namespace is NS_HTML
1372                                                 last_table = el
1373                                                 last_table_i = i
1374                                                 break
1375                                 # 3. If there is a last template and either there is no last
1376                                 # table, or there is one, but last template is lower (more
1377                                 # recently added) than last table in the stack of open
1378                                 # elements, then: let adjusted insertion location be inside
1379                                 # last template's template contents, after its last child (if
1380                                 # any), and abort these substeps.
1381                                 if last_template and (last_table is null or last_template_i < last_table_i)
1382                                         target = last_template # fixfull should be it's contents
1383                                         target_i = target.children.length
1384                                         break
1385                                 # 4. If there is no last table, then let adjusted insertion
1386                                 # location be inside the first element in the stack of open
1387                                 # elements (the html element), after its last child (if any),
1388                                 # and abort these substeps. (fragment case)
1389                                 if last_table is null
1390                                         # this is odd
1391                                         target = open_els[open_els.length - 1]
1392                                         target_i = target.children.length
1393                                         break
1394                                 # 5. If last table has a parent element, then let adjusted
1395                                 # insertion location be inside last table's parent element,
1396                                 # immediately before last table, and abort these substeps.
1397                                 if last_table.parent?
1398                                         for c, i in last_table.parent.children
1399                                                 if c is last_table
1400                                                         target = last_table.parent
1401                                                         target_i = i
1402                                                         break
1403                                         break
1404                                 # 6. Let previous element be the element immediately above last
1405                                 # table in the stack of open elements.
1406                                 #
1407                                 # huh? how could it not have a parent?
1408                                 previous_element = open_els[last_table_i + 1]
1409                                 # 7. Let adjusted insertion location be inside previous
1410                                 # element, after its last child (if any).
1411                                 target = previous_element
1412                                 target_i = target.children.length
1413                                 # Note: These steps are involved in part because it's possible
1414                                 # for elements, the table element in this case in particular,
1415                                 # to have been moved by a script around in the DOM, or indeed
1416                                 # removed from the DOM entirely, after the element was inserted
1417                                 # by the parser.
1418                                 break # don't really loop
1419                 else
1420                         # Otherwise Let adjusted insertion location be inside target, after
1421                         # its last child (if any).
1422                         target_i = target.children.length
1423
1424                 # 3. If the adjusted insertion location is inside a template element,
1425                 # let it instead be inside the template element's template contents,
1426                 # after its last child (if any).
1427                 # fixfull (template)
1428
1429                 # 4. Return the adjusted insertion location.
1430                 return [target, target_i]
1431
1432         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433         # aka create_an_element_for_token
1434         token_to_element = (t, namespace, intended_parent) ->
1435                 # convert attributes into a hash
1436                 attrs = {}
1437                 for a in t.attrs_a
1438                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1439                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1440
1441                 # TODO 2. If the newly created element has an xmlns attribute in the
1442                 # XMLNS namespace whose value is not exactly the same as the element's
1443                 # namespace, that is a parse error. Similarly, if the newly created
1444                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445                 # value is not the XLink Namespace, that is a parse error.
1446
1447                 # fixfull: the spec says stuff about form pointers and ownerDocument
1448
1449                 return el
1450
1451         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1452         insert_foreign_element = (token, namespace) ->
1453                 ail = adjusted_insertion_location()
1454                 ail_el = ail[0]
1455                 ail_i = ail[1]
1456                 el = token_to_element token, namespace, ail_el
1457                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1458                 el.parent = ail_el
1459                 ail_el.children.splice ail_i, 0, el
1460                 open_els.unshift el
1461                 return el
1462         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1463         insert_html_element = (token) ->
1464                 insert_foreign_element token, NS_HTML
1465
1466         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467         # position should be [node, index_within_children]
1468         insert_comment = (t, position = null) ->
1469                 position ?= adjusted_insertion_location()
1470                 position[0].children.splice position[1], 0, t
1471
1472         # 8.2.5.2
1473         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1474         parse_generic_raw_text = (t) ->
1475                 insert_html_element t
1476                 tok_state = tok_state_rawtext
1477                 original_ins_mode = ins_mode
1478                 ins_mode = ins_mode_text
1479         parse_generic_rcdata_text = (t) ->
1480                 insert_html_element t
1481                 tok_state = tok_state_rcdata
1482                 original_ins_mode = ins_mode
1483                 ins_mode = ins_mode_text
1484
1485         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1487         generate_implied_end_tags = (except = null) ->
1488                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1489                         open_els.shift()
1490
1491         # 8.2.5.4 The rules for parsing tokens in HTML content
1492         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1493
1494         # 8.2.5.4.1 The "initial" insertion mode
1495         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1496         is_quirks_yes_doctype = (t) ->
1497                 if t.flag 'force-quirks'
1498                         return true
1499                 if t.name isnt 'html'
1500                         return true
1501                 if t.public_identifier?
1502                         pi = t.public_identifier.toLowerCase()
1503                         for p in quirks_yes_pi_prefixes
1504                                 if pi.substr(0, p.length) is p
1505                                         return true
1506                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1507                                 return true
1508                 if t.system_identifier?
1509                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1510                                 return true
1511                 else if t.public_identifier?
1512                         # already did this: pi = t.public_identifier.toLowerCase()
1513                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1514                                 return true
1515                 return false
1516         is_quirks_limited_doctype = (t) ->
1517                 if t.public_identifier?
1518                         pi = t.public_identifier.toLowerCase()
1519                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1520                                 return true
1521                         if t.system_identifier?
1522                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1523                                         return true
1524                 return false
1525         ins_mode_initial = (t) ->
1526                 if is_space_tok t
1527                         return
1528                 if t.type is TYPE_COMMENT
1529                         # ?fixfull
1530                         doc.children.push t
1531                         return
1532                 if t.type is TYPE_DOCTYPE
1533                         # fixfull syntax error from first paragraph and following bullets
1534                         # fixfull set doc.doctype
1535                         # fixfull is the "not an iframe srcdoc" thing relevant?
1536                         if is_quirks_yes_doctype t
1537                                 doc.flag 'quirks mode', QUIRKS_YES
1538                         else if is_quirks_limited_doctype t
1539                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1540                         doc.children.push t
1541                         ins_mode = ins_mode_before_html
1542                         return
1543                 # Anything else
1544                 # fixfull not iframe srcdoc?
1545                 parse_error()
1546                 doc.flag 'quirks mode', QUIRKS_YES
1547                 ins_mode = ins_mode_before_html
1548                 process_token t
1549                 return
1550
1551         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1552         ins_mode_before_html = (t) ->
1553                 if t.type is TYPE_DOCTYPE
1554                         parse_error()
1555                         return
1556                 if t.type is TYPE_COMMENT
1557                         doc.children.push t
1558                         return
1559                 if is_space_tok t
1560                         return
1561                 if t.type is TYPE_START_TAG and t.name is 'html'
1562                         el = token_to_element t, NS_HTML, doc
1563                         doc.children.push el
1564                         open_els.unshift(el)
1565                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566                         ins_mode = ins_mode_before_head
1567                         return
1568                 if t.type is TYPE_END_TAG
1569                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570                                 # fall through to "anything else"
1571                         else
1572                                 parse_error()
1573                                 return
1574                 # Anything else
1575                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576                 doc.children.push el
1577                 el.parent = doc
1578                 open_els.unshift el
1579                 # ?fixfull browsing context
1580                 ins_mode = ins_mode_before_head
1581                 process_token t
1582                 return
1583
1584         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1585         ins_mode_before_head = (t) ->
1586                 if is_space_tok t
1587                         return
1588                 if t.type is TYPE_COMMENT
1589                         insert_comment t
1590                         return
1591                 if t.type is TYPE_DOCTYPE
1592                         parse_error()
1593                         return
1594                 if t.type is TYPE_START_TAG and t.name is 'html'
1595                         ins_mode_in_body t
1596                         return
1597                 if t.type is TYPE_START_TAG and t.name is 'head'
1598                         el = insert_html_element t
1599                         head_element_pointer = el
1600                         ins_mode = ins_mode_in_head
1601                         return
1602                 if t.type is TYPE_END_TAG
1603                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604                                 # fall through to Anything else below
1605                         else
1606                                 parse_error()
1607                                 return
1608                 # Anything else
1609                 el = insert_html_element new_open_tag 'head'
1610                 head_element_pointer = el
1611                 ins_mode = ins_mode_in_head
1612                 process_token t
1613
1614         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1615         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616                 open_els.shift() # spec says this will be a 'head' node
1617                 ins_mode = ins_mode_after_head
1618                 process_token t
1619         ins_mode_in_head = (t) ->
1620                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1621                         insert_character t
1622                         return
1623                 if t.type is TYPE_COMMENT
1624                         insert_comment t
1625                         return
1626                 if t.type is TYPE_DOCTYPE
1627                         parse_error()
1628                         return
1629                 if t.type is TYPE_START_TAG and t.name is 'html'
1630                         ins_mode_in_body t
1631                         return
1632                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633                         el = insert_html_element t
1634                         open_els.shift()
1635                         t.acknowledge_self_closing()
1636                         return
1637                 if t.type is TYPE_START_TAG and t.name is 'meta'
1638                         el = insert_html_element t
1639                         open_els.shift()
1640                         t.acknowledge_self_closing()
1641                         # fixfull encoding stuff
1642                         return
1643                 if t.type is TYPE_START_TAG and t.name is 'title'
1644                         parse_generic_rcdata_text t
1645                         return
1646                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647                         parse_generic_raw_text t
1648                         return
1649                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650                         insert_html_element t
1651                         ins_mode = ins_mode_in_head_noscript
1652                         return
1653                 if t.type is TYPE_START_TAG and t.name is 'script'
1654                         ail = adjusted_insertion_location()
1655                         el = token_to_element t, NS_HTML, ail
1656                         el.flag 'parser-inserted', true
1657                         # fixfull frament case
1658                         ail[0].children.splice ail[1], 0, el
1659                         open_els.unshift el
1660                         tok_state = tok_state_script_data
1661                         original_ins_mode = ins_mode # make sure orig... is defined
1662                         ins_mode = ins_mode_text
1663                         return
1664                 if t.type is TYPE_END_TAG and t.name is 'head'
1665                         open_els.shift() # will be a head element... spec says so
1666                         ins_mode = ins_mode_after_head
1667                         return
1668                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669                         ins_mode_in_head_else t
1670                         return
1671                 if t.type is TYPE_START_TAG and t.name is 'template'
1672                         insert_html_element t
1673                         afe_push_marker()
1674                         flag_frameset_ok = false
1675                         ins_mode = ins_mode_in_template
1676                         template_ins_modes.unshift ins_mode_in_template
1677                         return
1678                 if t.type is TYPE_END_TAG and t.name is 'template'
1679                         if template_tag_is_open()
1680                                 generate_implied_end_tags
1681                                 if open_els[0].name isnt 'template'
1682                                         parse_error()
1683                                 loop
1684                                         el = open_els.shift()
1685                                         if el.name is 'template' and el.namespace is NS_HTML
1686                                                 break
1687                                 clear_afe_to_marker()
1688                                 template_ins_modes.shift()
1689                                 reset_ins_mode()
1690                         else
1691                                 parse_error()
1692                         return
1693                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1694                         parse_error()
1695                         return
1696                 ins_mode_in_head_else t
1697
1698         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1699         ins_mode_in_head_noscript_else = (t) ->
1700                 parse_error()
1701                 open_els.shift()
1702                 ins_mode = ins_mode_in_head
1703                 process_token t
1704         ins_mode_in_head_noscript = (t) ->
1705                 # Token handler for the "in head noscript" insertion mode (spec
1706                 # section 8.2.5.4.5). The branches are tried from top to bottom and
1707                 # exactly one of them handles the token `t`.
1708                 if t.type is TYPE_DOCTYPE
1709                         parse_error()
1710                 else if t.type is TYPE_START_TAG and t.name is 'html'
1711                         # <html> start tags are delegated to the "in body" handler
1712                         ins_mode_in_body t
1713                 else if t.type is TYPE_END_TAG and t.name is 'noscript'
1714                         # drop the current node (index 0) and go back to the "in head" mode
1715                         open_els.shift()
1716                         ins_mode = ins_mode_in_head
1717                 else if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and t.name in ['basefont', 'bgsound', 'link', 'meta', 'noframes', 'style'])
1718                         # whitespace, comments and these start tags are delegated to the "in head" handler
1719                         ins_mode_in_head t
1720                 else if (t.type is TYPE_START_TAG and t.name in ['head', 'noscript']) or (t.type is TYPE_END_TAG and t.name isnt 'br')
1721                         # stray <head>/<noscript> start tags, and end tags other than </br>: error only
1722                         parse_error()
1723                 else
1724                         # Anything else, including </br>
1725                         ins_mode_in_head_noscript_else t
1726                 return
1727
1728
1729
1730         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1731         ins_mode_after_head_else = (t) ->
1732                 # "Anything else" steps of the "after head" mode: insert a body element built from a fresh open tag, switch to the "in body" mode, then pass `t` to process_token.
1733                 insert_html_element new_open_tag 'body'
1734                 ins_mode = ins_mode_in_body
1735                 process_token t
1736                 return
1737         ins_mode_after_head = (t) ->
1738                 # Token handler for the "after head" insertion mode (spec section
1739                 # 8.2.5.4.6). The branches are tried from top to bottom and exactly
1740                 # one of them handles the token `t`.
1741                 if is_space_tok t
1742                         insert_character t
1743                 else if t.type is TYPE_COMMENT
1744                         insert_comment t
1745                 else if t.type is TYPE_DOCTYPE
1746                         # DOCTYPE tokens are only reported here
1747                         parse_error()
1748                 else if t.type is TYPE_START_TAG and t.name is 'html'
1749                         ins_mode_in_body t
1750                 else if t.type is TYPE_START_TAG and t.name is 'body'
1751                         # an explicit <body> also clears flag_frameset_ok
1752                         insert_html_element t
1753                         flag_frameset_ok = false
1754                         ins_mode = ins_mode_in_body
1755                 else if t.type is TYPE_START_TAG and t.name is 'frameset'
1756                         insert_html_element t
1757                         ins_mode = ins_mode_in_frameset
1758                 else if t.type is TYPE_START_TAG and t.name in ['base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title']
1759                         # These tags are delegated to the "in head" handler: report the
1760                         # error, put the head element on top of the stack for that call,
1761                         # then search for it and take it back out.
1762                         parse_error()
1763                         open_els.unshift head_element_pointer
1764                         ins_mode_in_head t
1765                         head_at = open_els.indexOf head_element_pointer
1766                         if head_at >= 0
1767                                 open_els.splice head_at, 1
1768                         else
1769                                 console.log "warning: 23904 couldn't find head element in open_els"
1770                 else if t.type is TYPE_END_TAG and t.name is 'template'
1771                         ins_mode_in_head t
1772                 else if (t.type is TYPE_START_TAG and t.name is 'head') or (t.type is TYPE_END_TAG and t.name not in ['body', 'html', 'br'])
1773                         # a stray <head> start tag, or an end tag other than </body>,
1774                         # </html> and </br>: error only
1775                         parse_error()
1776                 else
1777                         # Anything else, including </body>, </html> and </br>
1778                         ins_mode_after_head_else t
1779                 return
1780
1781         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1782         in_body_any_other_end_tag = (name) -> # "any other end tag" steps for an end tag called `name`; factored out because adoption agency calls it
1783                 for el in open_els # index 0 is the current node, so this scans from the current node toward the root
1784                         if el.name is name and el.namespace is NS_HTML
1785                                 generate_implied_end_tags name # arg is exception; this can pop nodes, so a stack index taken before it would be stale
1786                                 parse_error() unless open_els[0].name is name and open_els[0].namespace is NS_HTML # error if the match is still not the current node
1787                                 while open_els.length > 0 # pop up to and including the matched element, judged against the live stack
1788                                         popped = open_els.shift()
1789                                         break if popped.name is name and popped.namespace is NS_HTML
1790                                 return
1791                         if special_elements[el.name] is el.namespace
1792                                 parse_error() # a special element ends the search; nothing is popped
1793                                 return
1794                 return
1795         ins_mode_in_body = (t) ->
1796                 if t.type is TYPE_TEXT and t.text is "\u0000"
1797                         parse_error()
1798                         return
1799                 if is_space_tok t
1800                         reconstruct_afe()
1801                         insert_character t
1802                         return
1803                 if t.type is TYPE_TEXT
1804                         reconstruct_afe()
1805                         insert_character t
1806                         flag_frameset_ok = false
1807                         return
1808                 if t.type is TYPE_COMMENT
1809                         insert_comment t
1810                         return
1811                 if t.type is TYPE_DOCTYPE
1812                         parse_error()
1813                         return
1814                 if t.type is TYPE_START_TAG and t.name is 'html'
1815                         parse_error()
1816                         return if template_tag_is_open()
1817                         root_attrs = open_els[open_els.length - 1].attrs
1818                         for a in t.attrs_a
1819                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1820                         return
1821
1822                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1823                         ins_mode_in_head t
1824                         return
1825                 if t.type is TYPE_START_TAG and t.name is 'body'
1826                         parse_error()
1827                         return if open_els.length < 2
1828                         second = open_els[open_els.length - 2]
1829                         return unless second.namespace is NS_HTML
1830                         return unless second.name is 'body'
1831                         return if template_tag_is_open()
1832                         flag_frameset_ok = false
1833                         for a in t.attrs_a
1834                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1835                         return
1836                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1837                         parse_error()
1838                         return if open_els.length < 2
1839                         second_i = open_els.length - 2
1840                         second = open_els[second_i]
1841                         return unless second.namespace is NS_HTML
1842                         return unless second.name is 'body'
1843                         if flag_frameset_ok is false
1844                                 return
1845                         if second.parent?
1846                                 for el, i in second.parent.children
1847                                         if el is second
1848                                                 second.parent.children.splice i, 1
1849                                                 break
1850                         open_els.splice second_i, 1
1851                         # pop everything except the "root html element"
1852                         while open_els.length > 1
1853                                 open_els.shift()
1854                         insert_html_element t
1855                         ins_mode = ins_mode_in_frameset
1856                         return
1857                 if t.type is TYPE_EOF
1858                         ok_tags = {
1859                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1860                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1861                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1862                         }
1863                         for el in open_els
1864                                 unless ok_tags[t.name] is el.namespace
1865                                         parse_error()
1866                                         break
1867                         if template_ins_modes.length > 0
1868                                 ins_mode_in_template t
1869                         else
1870                                 stop_parsing()
1871                         return
1872                 if t.type is TYPE_END_TAG and t.name is 'body'
1873                         unless is_in_scope 'body', NS_HTML
1874                                 parse_error()
1875                                 return
1876                         ok_tags = {
1877                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1878                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1879                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1880                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1881                                 html:NS_HTML
1882                         }
1883                         for el in open_els
1884                                 unless ok_tags[t.name] is el.namespace
1885                                         parse_error()
1886                                         break
1887                         ins_mode = ins_mode_after_body
1888                         return
1889                 if t.type is TYPE_END_TAG and t.name is 'html'
1890                         unless is_in_scope 'body', NS_HTML
1891                                 parse_error()
1892                                 return
1893                         ok_tags = {
1894                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1895                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1896                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1897                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1898                                 html:NS_HTML
1899                         }
1900                         for el in open_els
1901                                 unless ok_tags[t.name] is el.namespace
1902                                         parse_error()
1903                                         break
1904                         ins_mode = ins_mode_after_body
1905                         process_token t
1906                         return
1907                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1908                         close_p_if_in_button_scope()
1909                         insert_html_element t
1910                         return
1911                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1912                         close_p_if_in_button_scope()
1913                         if h_tags[open_els[0].name] is open_els[0].namespace
1914                                 parse_error()
1915                                 open_els.shift()
1916                         insert_html_element t
1917                         return
1918                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1919                         close_p_if_in_button_scope()
1920                         insert_html_element t
1921                         # spec: If the next token is a "LF" (U+000A) character token, then
1922                         # ignore that token and move on to the next one. (Newlines at the
1923                         # start of pre blocks are ignored as an authoring convenience.)
1924                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1925                                 cur += 1
1926                         flag_frameset_ok = false
1927                         return
1928                 if t.type is TYPE_START_TAG and t.name is 'form'
1929                         unless form_element_pointer is null or template_tag_is_open()
1930                                 parse_error()
1931                                 return
1932                         close_p_if_in_button_scope()
1933                         el = insert_html_element t
1934                         unless template_tag_is_open()
1935                                 form_element_pointer = el
1936                         return
1937                 if t.type is TYPE_START_TAG and t.name is 'li'
1938                         flag_frameset_ok = false
1939                         for node in open_els
1940                                 if node.name is 'li' and node.namespace is NS_HTML
1941                                         generate_implied_end_tags 'li' # arg is exception
1942                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1943                                                 parse_error()
1944                                         loop
1945                                                 el = open_els.shift()
1946                                                 if el.name is 'li' and el.namespace is NS_HTML
1947                                                         break
1948                                         break
1949                                 if el_is_special_not_adp node
1950                                                 break
1951                         close_p_if_in_button_scope()
1952                         insert_html_element t
1953                         return
1954                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1955                         flag_frameset_ok = false
1956                         for node in open_els
1957                                 if node.name is 'dd' and node.namespace is NS_HTML
1958                                         generate_implied_end_tags 'dd' # arg is exception
1959                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1960                                                 parse_error()
1961                                         loop
1962                                                 el = open_els.shift()
1963                                                 if el.name is 'dd' and el.namespace is NS_HTML
1964                                                         break
1965                                         break
1966                                 if node.name is 'dt' and node.namespace is NS_HTML
1967                                         generate_implied_end_tags 'dt' # arg is exception
1968                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1969                                                 parse_error()
1970                                         loop
1971                                                 el = open_els.shift()
1972                                                 if el.name is 'dt' and el.namespace is NS_HTML
1973                                                         break
1974                                         break
1975                                 if el_is_special_not_adp node
1976                                         break
1977                         close_p_if_in_button_scope()
1978                         insert_html_element t
1979                         return
1980                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1981                         close_p_if_in_button_scope()
1982                         insert_html_element t
1983                         tok_state = tok_state_plaintext
1984                         return
1985                 if t.type is TYPE_START_TAG and t.name is 'button'
1986                         if is_in_scope 'button', NS_HTML
1987                                 parse_error()
1988                                 generate_implied_end_tags()
1989                                 loop
1990                                         el = open_els.shift()
1991                                         if el.name is 'button' and el.namespace is NS_HTML
1992                                                 break
1993                         reconstruct_afe()
1994                         insert_html_element t
1995                         flag_frameset_ok = false
1996                         return
1997                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1998                         unless is_in_scope t.name, NS_HTML
1999                                 parse_error()
2000                                 return
2001                         generate_implied_end_tags()
2002                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2003                                 parse_error()
2004                         loop
2005                                 el = open_els.shift()
2006                                 if el.name is t.name and el.namespace is NS_HTML
2007                                         return
2008                         return
2009                 if t.type is TYPE_END_TAG and t.name is 'form'
2010                         unless template_tag_is_open()
2011                                 node = form_element_pointer
2012                                 form_element_pointer = null
2013                                 if node is null or not el_is_in_scope node
2014                                         parse_error()
2015                                         return
2016                                 generate_implied_end_tags()
2017                                 if open_els[0] isnt node
2018                                         parse_error()
2019                                 for el, i in open_els
2020                                         if el is node
2021                                                 open_els.splice i, 1
2022                                                 break
2023                         else
2024                                 unless is_in_scope 'form', NS_HTML
2025                                         parse_error()
2026                                         return
2027                                 generate_implied_end_tags()
2028                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2029                                         parse_error()
2030                                 loop
2031                                         el = open_els.shift()
2032                                         if el.name is 'form' and el.namespace is NS_HTML
2033                                                 break
2034                         return
2035                 if t.type is TYPE_END_TAG and t.name is 'p'
2036                         unless is_in_button_scope 'p', NS_HTML
2037                                 parse_error()
2038                                 insert_html_element new_open_tag 'p'
2039                         close_p_element()
2040                         return
2041                 if t.type is TYPE_END_TAG and t.name is 'li'
2042                         unless is_in_li_scope 'li', NS_HTML
2043                                 parse_error()
2044                                 return
2045                         generate_implied_end_tags 'li' # arg is exception
2046                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2047                                 parse_error()
2048                         loop
2049                                 el = open_els.shift()
2050                                 if el.name is 'li' and el.namespace is NS_HTML
2051                                         break
2052                         return
2053                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2054                         unless is_in_scope t.name, NS_HTML
2055                                 parse_error()
2056                                 return
2057                         generate_implied_end_tags t.name # arg is exception
2058                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2059                                 parse_error()
2060                         loop
2061                                 el = open_els.shift()
2062                                 if el.name is t.name and el.namespace is NS_HTML
2063                                         break
2064                         return
2065                 if t.type is TYPE_END_TAG and h_tags[t.name]?
2066                         h_in_scope = false
2067                         for el in open_els
2068                                 if h_tags[el.name] is el.namespace
2069                                         h_in_scope = true
2070                                         break
2071                                 if standard_scopers[el.name] is el.namespace
2072                                         break
2073                         unless h_in_scope
2074                                 parse_error()
2075                                 return
2076                         generate_implied_end_tags()
2077                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2078                                 parse_error()
2079                         loop
2080                                 el = open_els.shift()
2081                                 if h_tags[el.name] is el.namespace
2082                                         break
2083                         return
2084                 # deep breath!
2085                 if t.type is TYPE_START_TAG and t.name is 'a'
2086                         # If the list of active formatting elements contains an a element
2087                         # between the end of the list and the last marker on the list (or
2088                         # the start of the list if there is no marker on the list), then
2089                         # this is a parse error; run the adoption agency algorithm for the
2090                         # tag name "a", then remove that element from the list of active
2091                         # formatting elements and the stack of open elements if the
2092                         # adoption agency algorithm didn't already remove it (it might not
2093                         # have if the element is not in table scope).
2094                         found = false
2095                         for el in afe
2096                                 if el.type is TYPE_AFE_MARKER
2097                                         break
2098                                 if el.name is 'a' and el.namespace is NS_HTML
2099                                         found = el
2100                         if found?
2101                                 parse_error()
2102                                 adoption_agency 'a'
2103                                 for el, i in afe
2104                                         if el is found
2105                                                 afe.splice i, 1
2106                                 for el, i in open_els
2107                                         if el is found
2108                                                 open_els.splice i, 1
2109                         reconstruct_afe()
2110                         el = insert_html_element t
2111                         afe_push el
2112                         return
2113                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2114                         reconstruct_afe()
2115                         el = insert_html_element t
2116                         afe_push el
2117                         return
2118                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2119                         reconstruct_afe()
2120                         el = insert_html_element t
2121                         afe_push el
2122                         return
2123                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2124                         adoption_agency t.name
2125                         return
2126                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2127                         reconstruct_afe()
2128                         insert_html_element t
2129                         afe_push_marker()
2130                         flag_frameset_ok = false
2131                         return
2132                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2133                         unless is_in_scope t.name, NS_HTML
2134                                 parse_error()
2135                                 return
2136                         generate_implied_end_tags()
2137                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2138                                 parse_error()
2139                         loop
2140                                 el = open_els.shift()
2141                                 if el.name is t.name and el.namespace is NS_HTML
2142                                         break
2143                         clear_afe_to_marker()
2144                         return
2145                 if t.type is TYPE_START_TAG and t.name is 'table'
2146                         unless doc.flag('quirks mode') is QUIRKS_YES
2147                                 close_p_if_in_button_scope() # test
2148                         insert_html_element t
2149                         flag_frameset_ok = false
2150                         ins_mode = ins_mode_in_table
2151                         return
2152                 if t.type is TYPE_END_TAG and t.name is 'br'
2153                         parse_error()
2154                         t.type = TYPE_START_TAG
2155                         # fall through
2156                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2157                         reconstruct_afe()
2158                         insert_html_element t
2159                         open_els.shift()
2160                         t.acknowledge_self_closing()
2161                         flag_frameset_ok = false
2162                         return
2163                 if t.type is TYPE_START_TAG and t.name is 'input'
2164                         reconstruct_afe()
2165                         insert_html_element t
2166                         open_els.shift()
2167                         t.acknowledge_self_closing()
2168                         unless is_input_hidden_tok t
2169                                 flag_frameset_ok = false
2170                         return
2171                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2172                         insert_html_element t
2173                         open_els.shift()
2174                         t.acknowledge_self_closing()
2175                         return
2176                 if t.type is TYPE_START_TAG and t.name is 'hr'
2177                         close_p_if_in_button_scope()
2178                         insert_html_element t
2179                         open_els.shift()
2180                         t.acknowledge_self_closing()
2181                         flag_frameset_ok = false
2182                         return
2183                 if t.type is TYPE_START_TAG and t.name is 'image'
2184                         parse_error()
2185                         t.name = 'img'
2186                         process_token t
2187                         return
2188                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2189                         parse_error()
2190                         if template_tag_is_open() is false and form_element_pointer isnt null
2191                                 return
2192                         t.acknowledge_self_closing()
2193                         flag_frameset_ok = false
2194                         close_p_if_in_button_scope()
2195                         el = insert_html_element new_open_tag 'form'
2196                         unless template_tag_is_open()
2197                                 form_element_pointer = el
2198                         for a in t.attrs_a
2199                                 if a[0] is 'action'
2200                                         el.attrs['action'] = a[1]
2201                                         break
2202                         insert_html_element new_open_tag 'hr'
2203                         open_els.shift()
2204                         reconstruct_afe()
2205                         insert_html_element new_open_tag 'label'
2206                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2207                         input_el = new_open_tag 'input'
2208                         prompt = null
2209                         for a in t.attrs_a
2210                                 if a[0] is 'prompt'
2211                                         prompt = a[1]
2212                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2213                                         input_el.attrs_a.push [a[0], a[1]]
2214                         input_el.attrs_a.push ['name', 'isindex']
2215                         # fixfull this next bit is in english... internationalize?
2216                         prompt ?= "This is a searchable index. Enter search keywords: "
2217                         insert_character new_character_token prompt # fixfull split
2218                         # TODO submit typo "balue" in spec
2219                         insert_html_element input_el
2220                         open_els.shift()
2221                         # insert_character '' # you can put chars here if promt attr missing
2222                         open_els.shift()
2223                         insert_html_element new_open_tag 'hr'
2224                         open_els.shift()
2225                         open_els.shift()
2226                         unless template_tag_is_open()
2227                                 form_element_pointer = null
2228                         return
2229                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2230                         insert_html_element t
2231                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2232                                 cur += 1
2233                         tok_state = tok_state_rcdata
2234                         original_ins_mode = ins_mode
2235                         flag_frameset_ok = false
2236                         ins_mode = ins_mode_text
2237                         return
2238                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2239                         close_p_if_in_button_scope()
2240                         reconstruct_afe()
2241                         flag_frameset_ok = false
2242                         parse_generic_raw_text t
2243                         return
2244                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2245                         flag_frameset_ok = false
2246                         parse_generic_raw_text t
2247                         return
2248                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2249                         parse_generic_raw_text t
2250                         return
2251                 if t.type is TYPE_START_TAG and t.name is 'select'
2252                         reconstruct_afe()
2253                         insert_html_element t
2254                         flag_frameset_ok = false
2255                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2256                                 ins_mode = ins_mode_in_select_in_table
2257                         else
2258                                 ins_mode = ins_mode_in_select
2259                         return
2260                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2261                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2262                                 open_els.shift()
2263                         reconstruct_afe()
2264                         insert_html_element t
2265                         return
2266 # this comment block implements the W3C spec
2267 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2268 #                       if is_in_scope 'ruby', NS_HTML
2269 #                               generate_implied_end_tags()
2270 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2271 #                                       parse_error()
2272 #                       insert_html_element t
2273 #                       return
2274 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2275 #                       if is_in_scope 'ruby', NS_HTML
2276 #                               generate_implied_end_tags 'rtc' # arg is exception
2277 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2278 #                                       parse_error()
2279 #                       insert_html_element t
2280 #                       return
2281 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2282                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2283                         if is_in_scope 'ruby', NS_HTML
2284                                 generate_implied_end_tags()
2285                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2286                                         parse_error()
2287                         insert_html_element t
2288                         return
2289                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2290                         if is_in_scope 'ruby', NS_HTML
2291                                 generate_implied_end_tags 'rtc'
2292                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2293                                         parse_error()
2294                         insert_html_element t
2295                         return
2296 # end WHATWG chunk
2297                 if t.type is TYPE_START_TAG and t.name is 'math'
2298                         reconstruct_afe()
2299                         adjust_mathml_attributes t
2300                         adjust_foreign_attributes t
2301                         insert_foreign_element t, NS_MATHML
2302                         if t.flag 'self-closing'
2303                                 open_els.shift()
2304                                 t.acknowledge_self_closing()
2305                         return
2306                 if t.type is TYPE_START_TAG and t.name is 'svg'
2307                         reconstruct_afe()
2308                         adjust_svg_attributes t
2309                         adjust_foreign_attributes t
2310                         insert_foreign_element t, NS_SVG
2311                         if t.flag 'self-closing'
2312                                 open_els.shift()
2313                                 t.acknowledge_self_closing()
2314                         return
2315                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2316                         parse_error()
2317                         return
2318                 if t.type is TYPE_START_TAG # any other start tag
2319                         reconstruct_afe()
2320                         insert_html_element t
2321                         return
2322                 if t.type is TYPE_END_TAG # any other end tag
2323                         in_body_any_other_end_tag t.name
2324                         return
2325                 return
2326
2327         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2328         ins_mode_text = (t) ->
2329                 if t.type is TYPE_TEXT
2330                         insert_character t
2331                         return
2332                 if t.type is TYPE_EOF
2333                         parse_error()
2334                         if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2335                                 open_els[0].flag 'already started', true
2336                         open_els.shift()
2337                         ins_mode = original_ins_mode
2338                         process_token t
2339                         return
2340                 if t.type is TYPE_END_TAG and t.name is 'script'
2341                         open_els.shift()
2342                         ins_mode = original_ins_mode
2343                         # fixfull the spec seems to assume that I'm going to run the script
2344                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2345                         return
2346                 if t.type is TYPE_END_TAG
2347                         open_els.shift()
2348                         ins_mode = original_ins_mode
2349                         return
2350                 console.log 'warning: end of ins_mode_text reached'
2351
2352         # the functions below implement the tokenizer states described here:
2353         # http://www.w3.org/TR/html5/syntax.html#tokenization
2354
2355         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2356         ins_mode_in_table_else = (t) ->
2357                 parse_error()
2358                 flag_foster_parenting = true
2359                 ins_mode_in_body t
2360                 flag_foster_parenting = false
2361                 return
2362         ins_mode_in_table = (t) ->
2363                 switch t.type
2364                         when TYPE_TEXT
2365                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2366                                         pending_table_character_tokens = []
2367                                         original_ins_mode = ins_mode
2368                                         ins_mode = ins_mode_in_table_text
2369                                         process_token t
2370                                 else
2371                                         ins_mode_in_table_else t
2372                         when TYPE_COMMENT
2373                                 insert_comment t
2374                         when TYPE_DOCTYPE
2375                                 parse_error()
2376                         when TYPE_START_TAG
2377                                 switch t.name
2378                                         when 'caption'
2379                                                 clear_stack_to_table_context()
2380                                                 afe_push_marker()
2381                                                 insert_html_element t
2382                                                 ins_mode = ins_mode_in_caption
2383                                         when 'colgroup'
2384                                                 clear_stack_to_table_context()
2385                                                 insert_html_element t
2386                                                 ins_mode = ins_mode_in_column_group
2387                                         when 'col'
2388                                                 clear_stack_to_table_context()
2389                                                 insert_html_element new_open_tag 'colgroup'
2390                                                 ins_mode = ins_mode_in_column_group
2391                                                 process_token t
2392                                         when 'tbody', 'tfoot', 'thead'
2393                                                 clear_stack_to_table_context()
2394                                                 insert_html_element t
2395                                                 ins_mode = ins_mode_in_table_body
2396                                         when 'td', 'th', 'tr'
2397                                                 clear_stack_to_table_context()
2398                                                 insert_html_element new_open_tag 'tbody'
2399                                                 ins_mode = ins_mode_in_table_body
2400                                                 process_token t
2401                                         when 'table'
2402                                                 parse_error()
2403                                                 if is_in_table_scope 'table', NS_HTML
2404                                                         loop
2405                                                                 el = open_els.shift()
2406                                                                 if el.name is 'table' and el.namespace is NS_HTML
2407                                                                         break
2408                                                         reset_ins_mode()
2409                                                         process_token t
2410                                         when 'style', 'script', 'template'
2411                                                 ins_mode_in_head t
2412                                         when 'input'
2413                                                 unless is_input_hidden_tok t
2414                                                         ins_mode_in_table_else t
2415                                                 else
2416                                                         parse_error()
2417                                                         el = insert_html_element t
2418                                                         open_els.shift()
2419                                                         t.acknowledge_self_closing()
2420                                         when 'form'
2421                                                 parse_error()
2422                                                 if form_element_pointer?
2423                                                         return
2424                                                 if template_tag_is_open()
2425                                                         return
2426                                                 form_element_pointer = insert_html_element t
2427                                                 open_els.shift()
2428                                         else
2429                                                 ins_mode_in_table_else t
2430                         when TYPE_END_TAG
2431                                 switch t.name
2432                                         when 'table'
2433                                                 if is_in_table_scope 'table', NS_HTML
2434                                                         loop
2435                                                                 el = open_els.shift()
2436                                                                 if el.name is 'table' and el.namespace is NS_HTML
2437                                                                         break
2438                                                         reset_ins_mode()
2439                                                 else
2440                                                         parse_error()
2441                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2442                                                 parse_error()
2443                                         when 'template'
2444                                                 ins_mode_in_head t
2445                                         else
2446                                                 ins_mode_in_table_else t
2447                         when TYPE_EOF
2448                                 ins_mode_in_body t
2449                         else
2450                                 ins_mode_in_table_else t
2451
2452
2453         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2454         ins_mode_in_table_text = (t) ->
2455                 if t.type is TYPE_TEXT and t.text is "\u0000"
2456                         # from javascript?
2457                         parse_error()
2458                         return
2459                 if t.type is TYPE_TEXT
2460                         pending_table_character_tokens.push t
2461                         return
2462                 # Anything else
2463                 all_space = true
2464                 for old in pending_table_character_tokens
2465                         unless is_space_tok old
2466                                 all_space = false
2467                                 break
2468                 if all_space
2469                         for old in pending_table_character_tokens
2470                                 insert_character old
2471                 else
2472                         for old in pending_table_character_tokens
2473                                 ins_mode_in_table_else old
2474                 pending_table_character_tokens = []
2475                 ins_mode = original_ins_mode
2476                 process_token t
2477
2478         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2479         ins_mode_in_caption = (t) ->
2480                 if t.type is TYPE_END_TAG and t.name is 'caption'
2481                         if is_in_table_scope 'caption', NS_HTML
2482                                 generate_implied_end_tags()
2483                                 if open_els[0].name isnt 'caption'
2484                                         parse_error()
2485                                 loop
2486                                         el = open_els.shift()
2487                                         if el.name is 'caption' and el.namespace is NS_HTML
2488                                                 break
2489                                 clear_afe_to_marker()
2490                                 ins_mode = ins_mode_in_table
2491                         else
2492                                 parse_error()
2493                                 # fragment case
2494                         return
2495                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2496                         parse_error()
2497                         if is_in_table_scope 'caption', NS_HTML
2498                                 loop
2499                                         el = open_els.shift()
2500                                         if el.name is 'caption' and el.namespace is NS_HTML
2501                                                 break
2502                                 clear_afe_to_marker()
2503                                 ins_mode = ins_mode_in_table
2504                                 process_token t
2505                         # else fragment case
2506                         return
2507                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2508                         parse_error()
2509                         return
2510                 # Anything else
2511                 ins_mode_in_body t
2512
2513         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2514         ins_mode_in_column_group = (t) ->
2515                 if is_space_tok t
2516                         insert_character t
2517                         return
2518                 if t.type is TYPE_COMMENT
2519                         insert_comment t
2520                         return
2521                 if t.type is TYPE_DOCTYPE
2522                         parse_error()
2523                         return
2524                 if t.type is TYPE_START_TAG and t.name is 'html'
2525                         ins_mode_in_body t
2526                         return
2527                 if t.type is TYPE_START_TAG and t.name is 'col'
2528                         el = insert_html_element t
2529                         open_els.shift()
2530                         t.acknowledge_self_closing()
2531                         return
2532                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2533                         if open_els[0].name is 'colgroup' and open_els.namespace is NS_HTML
2534                                 open_els.shift()
2535                                 ins_mode = ins_mode_in_table
2536                         else
2537                                 parse_error()
2538                         return
2539                 if t.type is TYPE_END_TAG and t.name is 'col'
2540                         parse_error()
2541                         return
2542                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2543                         ins_mode_in_head t
2544                         return
2545                 if t.type is TYPE_EOF
2546                         ins_mode_in_body t
2547                         return
2548                 # Anything else
2549                 if open_els[0].name isnt 'colgroup'
2550                         parse_error()
2551                         return
2552                 open_els.shift()
2553                 ins_mode = ins_mode_in_table
2554                 process_token t
2555                 return
2556
2557         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2558         ins_mode_in_table_body = (t) ->
2559                 if t.type is TYPE_START_TAG and t.name is 'tr'
2560                         clear_stack_to_table_body_context()
2561                         insert_html_element t
2562                         ins_mode = ins_mode_in_row
2563                         return
2564                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2565                         parse_error()
2566                         clear_stack_to_table_body_context()
2567                         insert_html_element new_open_tag 'tr'
2568                         ins_mode = ins_mode_in_row
2569                         process_token t
2570                         return
2571                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2572                         unless is_in_table_scope t.name, NS_HTML
2573                                 parse_error()
2574                                 return
2575                         clear_stack_to_table_body_context()
2576                         open_els.shift()
2577                         ins_mode = ins_mode_in_table
2578                         return
2579                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2580                         has = false
2581                         for el in open_els
2582                                 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2583                                         has = true
2584                                         break
2585                                 if table_scopers[el.name] is el.namespace
2586                                         break
2587                         if !has
2588                                 parse_error()
2589                                 return
2590                         clear_stack_to_table_body_context()
2591                         open_els.shift()
2592                         ins_mode = ins_mode_in_table
2593                         process_token t
2594                         return
2595                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2596                         parse_error()
2597                         return
2598                 # Anything else
2599                 ins_mode_in_table t
2600
2601         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2602         ins_mode_in_row = (t) ->
2603                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2604                         clear_stack_to_table_row_context()
2605                         insert_html_element t
2606                         ins_mode = ins_mode_in_cell
2607                         afe_push_marker()
2608                         return
2609                 if t.type is TYPE_END_TAG and t.name is 'tr'
2610                         if is_in_table_scope 'tr', NS_HTML
2611                                 clear_stack_to_table_row_context()
2612                                 open_els.shift()
2613                                 ins_mode = ins_mode_in_table_body
2614                         else
2615                                 parse_error()
2616                         return
2617                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2618                         if is_in_table_scope 'tr', NS_HTML
2619                                 clear_stack_to_table_row_context()
2620                                 open_els.shift()
2621                                 ins_mode = ins_mode_in_table_body
2622                                 process_token t
2623                         else
2624                                 parse_error()
2625                         return
2626                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2627                         if is_in_table_scope t.name, NS_HTML
2628                                 if is_in_table_scope 'tr', NS_HTML
2629                                         clear_stack_to_table_row_context()
2630                                         open_els.shift()
2631                                         ins_mode = ins_mode_in_table_body
2632                                         process_token t
2633                         else
2634                                 parse_error()
2635                         return
2636                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2637                         parse_error()
2638                         return
2639                 # Anything else
2640                 ins_mode_in_table t
2641
2642         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2643         close_the_cell = ->
2644                 generate_implied_end_tags()
2645                 unless (open_els[0].name is 'td' or open_els[0] is 'th') and open_els[0].namespace is NS_HTML
2646                         parse_error()
2647                 loop
2648                         el = open_els.shift()
2649                         if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2650                                 break
2651                 clear_afe_to_marker()
2652                 ins_mode = ins_mode_in_row
2653
2654         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2655         ins_mode_in_cell = (t) ->
2656                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2657                         if is_in_table_scope t.name, NS_HTML
2658                                 generate_implied_end_tags()
2659                                 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2660                                         parse_error()
2661                                 loop
2662                                         el = open_els.shift()
2663                                         if el.name is t.name and el.namespace is NS_HTML
2664                                                 break
2665                                 clear_afe_to_marker()
2666                                 ins_mode = ins_mode_in_row
2667                         else
2668                                 parse_error()
2669                         return
2670                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2671                         has = false
2672                         for el in open_els
2673                                 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2674                                         has = true
2675                                         break
2676                                 if table_scopers[el.name] is el.namespace
2677                                         break
2678                         if !has
2679                                 parse_error()
2680                                 return
2681                         close_the_cell()
2682                         process_token t
2683                         return
2684                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2685                         parse_error()
2686                         return
2687                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2688                         if is_in_table_scope t.name, NS_HTML
2689                                 close_the_cell()
2690                                 process_token t
2691                         else
2692                                 parse_error()
2693                         return
2694                 # Anything Else
2695                 ins_mode_in_body t
2696
2697         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2698         ins_mode_in_select = (t) ->
2699                 if t.type is TYPE_TEXT and t.text is "\u0000"
2700                         parse_error()
2701                         return
2702                 if t.type is TYPE_TEXT
2703                         insert_character t
2704                         return
2705                 if t.type is TYPE_COMMENT
2706                         insert_comment t
2707                         return
2708                 if t.type is TYPE_DOCTYPE
2709                         parse_error()
2710                         return
2711                 if t.type is TYPE_START_TAG and t.name is 'html'
2712                         ins_mode_in_body t
2713                         return
2714                 if t.type is TYPE_START_TAG and t.name is 'option'
2715                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2716                                 open_els.shift()
2717                         insert_html_element t
2718                         return
2719                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2720                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2721                                 open_els.shift()
2722                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2723                                 open_els.shift()
2724                         insert_html_element t
2725                         return
2726                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2727                         if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
2728                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
2729                                         open_els.shift()
2730                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2731                                 open_els.shift()
2732                         else
2733                                 parse_error()
2734                         return
2735                 if t.type is TYPE_END_TAG and t.name is 'option'
2736                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2737                                 open_els.shift()
2738                         else
2739                                 parse_error()
2740                         return
2741                 if t.type is TYPE_END_TAG and t.name is 'select'
2742                         if is_in_select_scope 'select', NS_HTML
2743                                 loop
2744                                         el = open_els.shift()
2745                                         if el.name is 'select' and el.namespace is NS_HTML
2746                                                 break
2747                                 reset_ins_mode()
2748                         else
2749                                 parse_error()
2750                         return
2751                 if t.type is TYPE_START_TAG and t.name is 'select'
2752                         parse_error()
2753                         loop
2754                                 el = open_els.shift()
2755                                 if el.name is 'select' and el.namespace is NS_HTML
2756                                         break
2757                         reset_ins_mode()
2758                         # spec says that this is the same as </select> but it doesn't say
2759                         # to check scope first
2760                         return
2761                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2762                         parse_error()
2763                         if is_in_select_scope 'select', NS_HTML
2764                                 return
2765                         loop
2766                                 el = open_els.shift()
2767                                 if el.name is 'select' and el.namespace is NS_HTML
2768                                         break
2769                         reset_ins_mode()
2770                         process_token t
2771                         return
2772                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2773                         ins_mode_in_head t
2774                         return
2775                 if t.type is TYPE_EOF
2776                         ins_mode_in_body t
2777                         return
2778                 # Anything else
2779                 parse_error()
2780                 return
2781
2782         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2783         ins_mode_in_select_in_table = (t) ->
2784                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2785                         parse_error()
2786                         loop
2787                                 el = open_els.shift()
2788                                 if el.name is 'select' and el.namespace is NS_HTML
2789                                         break
2790                         reset_ins_mode()
2791                         process_token t
2792                         return
2793                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2794                         parse_error()
2795                         unless is_in_table_scope t.name, NS_HTML
2796                                 return
2797                         loop
2798                                 el = open_els.shift()
2799                                 if el.name is 'select' and el.namespace is NS_HTML
2800                                         break
2801                         reset_ins_mode()
2802                         process_token t
2803                         return
2804                 # Anything else
2805                 ins_mode_in_select t
2806                 return
2807
2808         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2809         ins_mode_in_template = (t) ->
2810                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2811                         ins_mode_in_body t
2812                         return
2813                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2814                         ins_mode_in_head t
2815                         return
2816                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2817                         template_ins_modes.shift()
2818                         template_ins_modes.unshift ins_mode_in_table
2819                         ins_mode = ins_mode_in_table
2820                         process_token t
2821                         return
2822                 if t.type is TYPE_START_TAG and t.name is 'col'
2823                         template_ins_modes.shift()
2824                         template_ins_modes.unshift ins_mode_in_column_group
2825                         ins_mode = ins_mode_in_column_group
2826                         process_token t
2827                         return
2828                 if t.type is TYPE_START_TAG and t.name is 'tr'
2829                         template_ins_modes.shift()
2830                         template_ins_modes.unshift ins_mode_in_table_body
2831                         ins_mode = ins_mode_in_table_body
2832                         process_token t
2833                         return
2834                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2835                         template_ins_modes.shift()
2836                         template_ins_modes.unshift ins_mode_in_row
2837                         ins_mode = ins_mode_in_row
2838                         process_token t
2839                         return
2840                 if t.type is TYPE_START_TAG
2841                         template_ins_modes.shift()
2842                         template_ins_modes.unshift ins_mode_in_body
2843                         ins_mode = ins_mode_in_body
2844                         process_token t
2845                         return
2846                 if t.type is TYPE_END_TAG
2847                         parse_error()
2848                         return
2849                 if t.type is TYPE_EOF
2850                         unless template_tag_is_open()
2851                                 stop_parsing()
2852                                 return
2853                         parse_error()
2854                         loop
2855                                 el = open_els.shift()
2856                                 if el.name is 'template' and el.namespace is NS_HTML
2857                                         break
2858                         clear_afe_to_marker()
2859                         template_ins_modes.shift()
2860                         reset_ins_mode()
2861                         process_token t
2862
2863         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2864         ins_mode_after_body = (t) ->
2865                 if is_space_tok t
2866                         ins_mode_in_body t
2867                         return
2868                 if t.type is TYPE_COMMENT
2869                         first = open_els[open_els.length - 1]
2870                         insert_comment t, [first, first.children.length]
2871                         return
2872                 if t.type is TYPE_DOCTYPE
2873                         parse_error()
2874                         return
2875                 if t.type is TYPE_START_TAG and t.name is 'html'
2876                         ins_mode_in_body t
2877                         return
2878                 if t.type is TYPE_END_TAG and t.name is 'html'
2879                         if flag_fragment_parsing
2880                                 parse_error()
2881                                 return
2882                         ins_mode = ins_mode_after_after_body
2883                         return
2884                 if t.type is TYPE_EOF
2885                         stop_parsing()
2886                         return
2887                 # Anything ELse
2888                 parse_error()
2889                 ins_mode = ins_mode_in_body
2890                 process_token t
2891
2892         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2893         ins_mode_in_frameset = (t) ->
2894                 if is_space_tok t
2895                         insert_character t
2896                         return
2897                 if t.type is TYPE_COMMENT
2898                         insert_comment t
2899                         return
2900                 if t.type is TYPE_DOCTYPE
2901                         parse_error()
2902                         return
2903                 if t.type is TYPE_START_TAG and t.name is 'html'
2904                         ins_mode_in_body t
2905                         return
2906                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2907                         insert_html_element t
2908                         return
2909                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2910                         if open_els.length is 1
2911                                 parse_error()
2912                                 return # fragment case
2913                         open_els.shift()
2914                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2915                                 ins_mode = ins_mode_after_frameset
2916                         return
2917                 if t.type is TYPE_START_TAG and t.name is 'frame'
2918                         insert_html_element t
2919                         open_els.shift()
2920                         t.acknowledge_self_closing()
2921                         return
2922                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2923                         ins_mode_in_head t
2924                         return
2925                 if t.type is TYPE_EOF
2926                         if open_els.length isnt 1
2927                                 parse_error()
2928                         stop_parsing()
2929                         return
2930                 # Anything else
2931                 parse_error()
2932                 return
2933
2934         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2935         ins_mode_after_frameset = (t) ->
2936                 if is_space_tok t
2937                         insert_character t
2938                         return
2939                 if t.type is TYPE_COMMENT
2940                         insert_comment t
2941                         return
2942                 if t.type is TYPE_DOCTYPE
2943                         parse_error()
2944                         return
2945                 if t.type is TYPE_START_TAG and t.name is 'html'
2946                         ins_mode_in_body t
2947                         return
2948                 if t.type is TYPE_END_TAG and t.name is 'html'
2949                         ins_mode = ins_mode_after_after_frameset
2950                         return
2951                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2952                         ins_mode_in_head t
2953                         return
2954                 if t.type is TYPE_EOF
2955                         stop_parsing()
2956                         return
2957                 # Anything else
2958                 parse_error()
2959                 return
2960
2961         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2962         ins_mode_after_after_body = (t) ->
2963                 if t.type is TYPE_COMMENT
2964                         insert_comment t, [doc, doc.children.length]
2965                         return
2966                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2967                         ins_mode_in_body t
2968                         return
2969                 if t.type is TYPE_EOF
2970                         stop_parsing()
2971                         return
2972                 # Anything else
2973                 parse_error()
2974                 ins_mode = ins_mode_in_body
2975                 process_token t
2976                 return
2977
2978         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2979         ins_mode_after_after_frameset = (t) ->
2980                 if t.type is TYPE_COMMENT
2981                         insert_comment t, [doc, doc.children.length]
2982                         return
2983                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2984                         ins_mode_in_body t
2985                         return
2986                 if t.type is TYPE_EOF
2987                         stop_parsing()
2988                         return
2989                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2990                         ins_mode_in_head t
2991                         return
2992                 # Anything else
2993                 parse_error()
2994                 return
2995
2996         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2997         has_color_face_or_size = (t) ->
2998                 for a in t.attrs_a
2999                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
3000                                 return true
3001                 return false
3002         in_foreign_content_end_script = ->
3003                 open_els.shift()
3004                 # fixfull
3005                 return
3006         in_foreign_content_other_start = (t) ->
3007                 acn = adjusted_current_node()
3008                 if acn.namespace is NS_MATHML
3009                         adjust_mathml_attributes t
3010                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3011                         t.name = svg_name_fixes[t.name]
3012                 if acn.namespace is NS_SVG
3013                         adjust_svg_attributes t
3014                 adjust_foreign_attributes t
3015                 insert_foreign_element t, acn.namespace
3016                 if t.flag 'self-closing'
3017                         if t.name is 'script'
3018                                 t.acknowledge_self_closing()
3019                                 in_foreign_content_end_script()
3020                                 # fixfull
3021                         else
3022                                 open_els.shift()
3023                                 t.acknowledge_self_closing()
3024                 return
3025         in_foreign_content = (t) ->
3026                 if t.type is TYPE_TEXT and t.text is "\u0000"
3027                         parse_error()
3028                         insert_character new_character_token "\ufffd"
3029                         return
3030                 if is_space_tok t
3031                         insert_character t
3032                         return
3033                 if t.type is TYPE_TEXT
3034                         flag_frameset_ok = false
3035                         insert_character t
3036                         return
3037                 if t.type is TYPE_COMMENT
3038                         insert_comment t
3039                         return
3040                 if t.type is TYPE_DOCTYPE
3041                         parse_error()
3042                         return
3043                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3044                         parse_error()
3045                         if flag_fragment_parsing
3046                                 in_foreign_content_other_start t
3047                                 return
3048                         loop # is this safe?
3049                                 open_els.shift()
3050                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3051                                         break
3052                         process_token t
3053                         return
3054                 if t.type is TYPE_START_TAG
3055                         in_foreign_content_other_start t
3056                         return
3057                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3058                         in_foreign_content_end_script()
3059                         return
3060                 if t.type is TYPE_END_TAG
3061                         i = 0
3062                         node = open_els[i]
3063                         if node.name.toLowerCase() isnt t.name
3064                                 parse_error()
3065                         loop
3066                                 if node is open_els[open_els.length - 1]
3067                                         return
3068                                 if node.name.toLowerCase() is t.name
3069                                         loop
3070                                                 el = open_els.shift()
3071                                                 if el is node
3072                                                         return
3073                                 i += 1
3074                                 node = open_els[i]
3075                                 if node.namespace is NS_HTML
3076                                         break
3077                         ins_mode t # explicitly call HTML insertion mode
3078
3079
3080         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3081         tok_state_data = ->
3082                 switch c = txt.charAt(cur++)
3083                         when '&'
3084                                 return new_text_node parse_character_reference()
3085                         when '<'
3086                                 tok_state = tok_state_tag_open
3087                         when "\u0000"
3088                                 parse_error()
3089                                 return new_text_node "\ufffd"
3090                         when '' # EOF
3091                                 return new_eof_token()
3092                         else
3093                                 return new_text_node c
3094                 return null
3095
3096         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3097         # not needed: tok_state_character_reference_in_data = ->
3098         # just call parse_character_reference()
3099
3100         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3101         tok_state_rcdata = ->
3102                 switch c = txt.charAt(cur++)
3103                         when '&'
3104                                 return new_text_node parse_character_reference()
3105                         when '<'
3106                                 tok_state = tok_state_rcdata_less_than_sign
3107                         when "\u0000"
3108                                 parse_error()
3109                                 return new_character_token "\ufffd"
3110                         when '' # EOF
3111                                 return new_eof_token()
3112                         else
3113                                 return new_character_token c
3114                 return null
3115
3116         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3117         # not needed: tok_state_character_reference_in_rcdata = ->
3118         # just call parse_character_reference()
3119
3120         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3121         tok_state_rawtext = ->
3122                 switch c = txt.charAt(cur++)
3123                         when '<'
3124                                 tok_state = tok_state_rawtext_less_than_sign
3125                         when "\u0000"
3126                                 parse_error()
3127                                 return new_character_token "\ufffd"
3128                         when '' # EOF
3129                                 return new_eof_token()
3130                         else
3131                                 return new_character_token c
3132                 return null
3133
3134         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3135         tok_state_script_data = ->
3136                 switch c = txt.charAt(cur++)
3137                         when '<'
3138                                 tok_state = tok_state_script_data_less_than_sign
3139                         when "\u0000"
3140                                 parse_error()
3141                                 return new_character_token "\ufffd"
3142                         when '' # EOF
3143                                 return new_eof_token()
3144                         else
3145                                 return new_character_token c
3146                 return null
3147
3148         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3149         tok_state_plaintext = ->
3150                 switch c = txt.charAt(cur++)
3151                         when "\u0000"
3152                                 parse_error()
3153                                 return new_character_token "\ufffd"
3154                         when '' # EOF
3155                                 return new_eof_token()
3156                         else
3157                                 return new_character_token c
3158                 return null
3159
3160
3161         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3162         tok_state_tag_open = ->
3163                 c = txt.charAt(cur++)
3164                 if c is '!'
3165                         tok_state = tok_state_markup_declaration_open
3166                         return
3167                 if c is '/'
3168                         tok_state = tok_state_end_tag_open
3169                         return
3170                 if is_uc_alpha(c)
3171                         tok_cur_tag = new_open_tag c.toLowerCase()
3172                         tok_state = tok_state_tag_name
3173                         return
3174                 if is_lc_alpha(c)
3175                         tok_cur_tag = new_open_tag c
3176                         tok_state = tok_state_tag_name
3177                         return
3178                 if c is '?'
3179                         parse_error()
3180                         tok_cur_tag = new_comment_token '?' # FIXME right?
3181                         tok_state = tok_state_bogus_comment
3182                         return
3183                 # Anything else
3184                 parse_error()
3185                 tok_state = tok_state_data
3186                 cur -= 1 # we didn't parse/handle the char after <
3187                 return new_text_node '<'
3188
3189         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3190         tok_state_end_tag_open = ->
3191                 c = txt.charAt(cur++)
3192                 if is_uc_alpha(c)
3193                         tok_cur_tag = new_end_tag c.toLowerCase()
3194                         tok_state = tok_state_tag_name
3195                         return
3196                 if is_lc_alpha(c)
3197                         tok_cur_tag = new_end_tag c
3198                         tok_state = tok_state_tag_name
3199                         return
3200                 if c is '>'
3201                         parse_error()
3202                         tok_state = tok_state_data
3203                         return
3204                 if c is '' # EOF
3205                         parse_error()
3206                         tok_state = tok_state_data
3207                         return new_text_node '</'
3208                 # Anything else
3209                 parse_error()
3210                 tok_cur_tag = new_comment_token c
3211                 tok_state = tok_state_bogus_comment
3212                 return null
3213
3214         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3215         tok_state_tag_name = ->
3216                 switch c = txt.charAt(cur++)
3217                         when "\t", "\n", "\u000c", ' '
3218                                 tok_state = tok_state_before_attribute_name
3219                         when '/'
3220                                 tok_state = tok_state_self_closing_start_tag
3221                         when '>'
3222                                 tok_state = tok_state_data
3223                                 tmp = tok_cur_tag
3224                                 tok_cur_tag = null
3225                                 return tmp
3226                         when "\u0000"
3227                                 parse_error()
3228                                 tok_cur_tag.name += "\ufffd"
3229                         when '' # EOF
3230                                 parse_error()
3231                                 tok_state = tok_state_data
3232                         else
3233                                 if is_uc_alpha(c)
3234                                         tok_cur_tag.name += c.toLowerCase()
3235                                 else
3236                                         tok_cur_tag.name += c
3237                 return null
3238
3239         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3240         tok_state_rcdata_less_than_sign = ->
3241                 c = txt.charAt(cur++)
3242                 if c is '/'
3243                         temporary_buffer = ''
3244                         tok_state = tok_state_rcdata_end_tag_open
3245                         return null
3246                 # Anything else
3247                 tok_state = tok_state_rcdata
3248                 cur -= 1 # reconsume the input character
3249                 return new_character_token '<'
3250
3251         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3252         tok_state_rcdata_end_tag_open = ->
3253                 c = txt.charAt(cur++)
3254                 if is_uc_alpha(c)
3255                         tok_cur_tag = new_end_tag c.toLowerCase()
3256                         temporary_buffer += c
3257                         tok_state = tok_state_rcdata_end_tag_name
3258                         return null
3259                 if is_lc_alpha(c)
3260                         tok_cur_tag = new_end_tag c
3261                         temporary_buffer += c
3262                         tok_state = tok_state_rcdata_end_tag_name
3263                         return null
3264                 # Anything else
3265                 tok_state = tok_state_rcdata
3266                 cur -= 1 # reconsume the input character
3267                 return new_character_token "</" # fixfull separate these
3268
3269         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3270         is_appropriate_end_tag = (t) ->
3271                 # spec says to check against "the tag name of the last start tag to
3272                 # have been emitted from this tokenizer", but this is only called from
3273                 # the various "raw" states, so it's hopefully ok to assume that
3274                 # open_els[0].name will work instead TODO: verify this after the script
3275                 # data states are implemented
3276                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3277                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3278
3279         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3280         tok_state_rcdata_end_tag_name = ->
3281                 c = txt.charAt(cur++)
3282                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3283                         if is_appropriate_end_tag tok_cur_tag
3284                                 tok_state = tok_state_before_attribute_name
3285                                 return
3286                         # else fall through to "Anything else"
3287                 if c is '/'
3288                         if is_appropriate_end_tag tok_cur_tag
3289                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3290                                 return
3291                         # else fall through to "Anything else"
3292                 if c is '>'
3293                         if is_appropriate_end_tag tok_cur_tag
3294                                 tok_state = tok_state_data
3295                                 return tok_cur_tag
3296                         # else fall through to "Anything else"
3297                 if is_uc_alpha(c)
3298                         tok_cur_tag.name += c.toLowerCase()
3299                         temporary_buffer += c
3300                         return null
3301                 if is_lc_alpha(c)
3302                         tok_cur_tag.name += c
3303                         temporary_buffer += c
3304                         return null
3305                 # Anything else
3306                 tok_state = tok_state_rcdata
3307                 cur -= 1 # reconsume the input character
3308                 return new_character_token '</' + temporary_buffer # fixfull separate these
3309
3310         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3311         tok_state_rawtext_less_than_sign = ->
3312                 c = txt.charAt(cur++)
3313                 if c is '/'
3314                         temporary_buffer = ''
3315                         tok_state = tok_state_rawtext_end_tag_open
3316                         return null
3317                 # Anything else
3318                 tok_state = tok_state_rawtext
3319                 cur -= 1 # reconsume the input character
3320                 return new_character_token '<'
3321
3322         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3323         tok_state_rawtext_end_tag_open = ->
3324                 c = txt.charAt(cur++)
3325                 if is_uc_alpha(c)
3326                         tok_cur_tag = new_end_tag c.toLowerCase()
3327                         temporary_buffer += c
3328                         tok_state = tok_state_rawtext_end_tag_name
3329                         return null
3330                 if is_lc_alpha(c)
3331                         tok_cur_tag = new_end_tag c
3332                         temporary_buffer += c
3333                         tok_state = tok_state_rawtext_end_tag_name
3334                         return null
3335                 # Anything else
3336                 tok_state = tok_state_rawtext
3337                 cur -= 1 # reconsume the input character
3338                 return new_character_token "</" # fixfull separate these
3339
3340         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3341         tok_state_rawtext_end_tag_name = ->
3342                 # Collects the name of a possible end tag inside RAWTEXT. Whitespace, "/" and
3343                 # ">" finish the tag only when is_appropriate_end_tag accepts the current tag.
3344                 c = txt.charAt(cur++)
3345                 if c in ["\t", "\n", "\u000c", ' ', '/', '>'] and is_appropriate_end_tag(tok_cur_tag)
3346                         switch c
3347                                 when '/'
3348                                         tok_state = tok_state_self_closing_start_tag
3349                                         return
3350                                 when '>'
3351                                         tok_state = tok_state_data
3352                                         return tok_cur_tag
3353                                 else
3354                                         tok_state = tok_state_before_attribute_name
3355                                         return
3356                 # Not a tag terminator (or not an appropriate end tag): letters extend the name
3357                 if is_uc_alpha(c)
3358                         tag_name_char = c.toLowerCase()
3359                 else if is_lc_alpha(c)
3360                         tag_name_char = c
3361                 else
3362                         # Anything else
3363                         tok_state = tok_state_rawtext
3364                         cur -= 1 # reconsume the input character
3365                         return new_character_token '</' + temporary_buffer # fixfull separate these
3366                 # the tag name gets the lowercased letter, the buffer the letter as written
3367                 tok_cur_tag.name += tag_name_char
3368                 temporary_buffer += c
3369                 return null
3370
3371         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3372         tok_state_script_data_less_than_sign = ->
3373                 switch c = txt.charAt(cur++)
3374                         when '/'
3375                                 temporary_buffer = ''
3376                                 tok_state = tok_state_script_data_end_tag_open
3377                                 return
3378                         when '!'
3379                                 tok_state = tok_state_script_data_escape_start
3380                                 return new_character_token '<!' # fixfull split
3381                 # Anything else: the "<" was ordinary script text
3382                 tok_state = tok_state_script_data
3383                 cur -= 1 # Reconsume
3384                 return new_character_token '<'
3385
3386         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3387         tok_state_script_data_end_tag_open = ->
3388                 # Entered after "</" in script data: an ASCII letter starts an end tag token.
3389                 c = txt.charAt(cur++)
3390                 if is_uc_alpha(c)
3391                         name_start = c.toLowerCase()
3392                 else if is_lc_alpha(c)
3393                         name_start = c
3394                 else
3395                         # Anything else
3396                         tok_state = tok_state_script_data
3397                         cur -= 1 # Reconsume
3398                         return new_character_token '</'
3399                 tok_cur_tag = new_end_tag name_start
3400                 temporary_buffer += c # buffer keeps the character as written
3401                 tok_state = tok_state_script_data_end_tag_name
3402                 return
3403
3404         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3405         tok_state_script_data_end_tag_name = ->
3406                 # Collects the name of a possible end tag inside script data. Whitespace, "/"
3407                 # and ">" finish the tag only when is_appropriate_end_tag accepts it.
3408                 c = txt.charAt(cur++)
3409                 is_terminator = c is "\t" or c is "\n" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3410                 if is_terminator and is_appropriate_end_tag(tok_cur_tag)
3411                         if c is '>'
3412                                 tok_state = tok_state_data
3413                                 return tok_cur_tag
3414                         if c is '/'
3415                                 tok_state = tok_state_self_closing_start_tag
3416                         else
3417                                 tok_state = tok_state_before_attribute_name
3418                         return
3419                 # Either not a terminator, or the tag is not an appropriate end tag:
3420                 # letters extend the name, everything else is "Anything else".
3421                 if is_uc_alpha(c)
3422                         tok_cur_tag.name += c.toLowerCase()
3423                 else if is_lc_alpha(c)
3424                         tok_cur_tag.name += c
3425                 else
3426                         # Anything else
3427                         tok_state = tok_state_script_data
3428                         cur -= 1 # Reconsume
3429                         return new_character_token "</#{temporary_buffer}" # fixfull split
3430                 # only letters get here; the buffer records them as written so the
3431                 # "Anything else" branch can give the text back unchanged
3432                 temporary_buffer += c
3433                 return
3434
3435         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3436         tok_state_script_data_escape_start = ->
3437                 # After "<!" in script data: a "-" moves on to the escape start dash state.
3438                 c = txt.charAt(cur) # peek only, so "Anything else" needs no reconsume
3439                 unless c is '-'
3440                         tok_state = tok_state_script_data
3441                         return
3442                 cur += 1 # consume the "-"
3443                 tok_state = tok_state_script_data_escape_start_dash
3444                 return new_character_token '-'
3445
3446         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3447         tok_state_script_data_escape_start_dash = ->
3448                 # After "<!-" in script data: a second "-" enters the escaped dash dash state.
3449                 c = txt.charAt(cur++)
3450                 if c isnt '-'
3451                         cur -= 1 # Reconsume
3452                         tok_state = tok_state_script_data
3453                         return
3454                 tok_state = tok_state_script_data_escaped_dash_dash
3455                 return new_character_token '-'
3456
3457         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3458         tok_state_script_data_escaped = ->
3459                 # Script data escaped state: emits one character token per input character.
3460                 switch c = txt.charAt(cur++)
3461                         when '<'
3462                                 tok_state = tok_state_script_data_escaped_less_than_sign
3463                                 return
3464                         when "\u0000"
3465                                 parse_error()
3466                                 return new_character_token "\ufffd"
3467                         when '' # EOF
3468                                 tok_state = tok_state_data
3469                                 parse_error()
3470                                 cur -= 1 # Reconsume
3471                                 return
3472                         when '-'
3473                                 tok_state = tok_state_script_data_escaped_dash
3474                 # "-" (after the state switch above) and anything else are emitted as-is
3475                 return new_character_token c
3476
3477         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3478         tok_state_script_data_escaped_dash = ->
3479                 # One "-" has been seen in escaped script data; pick the next state from this character.
3480                 switch c = txt.charAt(cur++)
3481                         when '<'
3482                                 tok_state = tok_state_script_data_escaped_less_than_sign
3483                                 return
3484                         when "\u0000"
3485                                 parse_error()
3486                                 tok_state = tok_state_script_data_escaped
3487                                 return new_character_token "\ufffd"
3488                         when '' # EOF
3489                                 tok_state = tok_state_data
3490                                 parse_error()
3491                                 cur -= 1 # Reconsume
3492                                 return
3493                         when '-'
3494                                 tok_state = tok_state_script_data_escaped_dash_dash
3495                         else
3496                                 tok_state = tok_state_script_data_escaped
3497                 return new_character_token c
3498
3499         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3500         tok_state_script_data_escaped_dash_dash = ->
3501                 # Two or more "-" have been seen in escaped script data; ">" would end the escape.
3502                 switch c = txt.charAt(cur++)
3503                         when '<'
3504                                 tok_state = tok_state_script_data_escaped_less_than_sign
3505                                 return
3506                         when "\u0000"
3507                                 parse_error()
3508                                 tok_state = tok_state_script_data_escaped
3509                                 return new_character_token "\ufffd"
3510                         when '' # EOF
3511                                 parse_error()
3512                                 tok_state = tok_state_data
3513                                 cur -= 1 # Reconsume
3514                                 return
3515                         when '>'
3516                                 tok_state = tok_state_script_data
3517                         else
3518                                 # further dashes stay in this state; any other character
3519                                 # returns to the plain script data escaped state
3520                                 tok_state = tok_state_script_data_escaped unless c is '-'
3521                 return new_character_token c
3522
3523         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3524         tok_state_script_data_escaped_less_than_sign = ->
3525                 # A "<" was seen in escaped script data: "/" may open an end tag, a letter may open "<script".
3526                 c = txt.charAt(cur++)
3527                 if c is '/'
3528                         temporary_buffer = ''
3529                         tok_state = tok_state_script_data_escaped_end_tag_open
3530                         return
3531                 else if is_uc_alpha(c)
3532                         temporary_buffer = c.toLowerCase() # yes, really
3533                 else if is_lc_alpha(c)
3534                         temporary_buffer = c
3535                 else
3536                         # Anything else
3537                         tok_state = tok_state_script_data_escaped
3538                         cur -= 1 # Reconsume
3539                         return new_character_token '<'
3540                 tok_state = tok_state_script_data_double_escape_start
3541                 return new_character_token "<#{c}" # fixfull split
3542
3543         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3544         tok_state_script_data_escaped_end_tag_open = ->
3545                 # Entered after "</" in escaped script data: an ASCII letter starts an end tag token.
3546                 c = txt.charAt(cur++)
3547                 if is_uc_alpha(c)
3548                         initial = c.toLowerCase()
3549                 else if is_lc_alpha(c)
3550                         initial = c
3551                 else
3552                         # Anything else
3553                         tok_state = tok_state_script_data_escaped
3554                         cur -= 1 # Reconsume
3555                         return new_character_token '</' # fixfull split
3556                 tok_cur_tag = new_end_tag initial
3557                 temporary_buffer += c # buffer keeps the character as written
3558                 tok_state = tok_state_script_data_escaped_end_tag_name
3559                 return
3560
3561         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3562         tok_state_script_data_escaped_end_tag_name = ->
3563                 # Collects the name of a possible end tag inside escaped script data. Whitespace,
3564                 # "/" and ">" finish the tag only when is_appropriate_end_tag accepts it.
3565                 c = txt.charAt(cur++)
3566                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3567                         if is_appropriate_end_tag tok_cur_tag
3568                                 tok_state = tok_state_before_attribute_name
3569                                 return
3570                         # fall through
3571                 if c is '/'
3572                         if is_appropriate_end_tag tok_cur_tag
3573                                 tok_state = tok_state_self_closing_start_tag
3574                                 return
3575                         # fall through
3576                 if c is '>'
3577                         if is_appropriate_end_tag tok_cur_tag
3578                                 tok_state = tok_state_data
3579                                 return tok_cur_tag
3580                         # fall through
3581                 if is_uc_alpha(c) or is_lc_alpha(c)
3582                         # Only the tag name is lowercased. The buffer must keep the input
3583                         # exactly as written, because "Anything else" re-emits it as text.
3584                         tok_cur_tag.name += c.toLowerCase()
3585                         temporary_buffer += c
3586                         return
3587                 # Anything else
3588                 tok_state = tok_state_script_data_escaped
3589                 cur -= 1 # Reconsume
3590                 return new_character_token "</#{temporary_buffer}" # fixfull split
3591
3592         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3593         tok_state_script_data_double_escape_start = ->
3594                 # Reads the word after "<" in escaped script data; "script" starts double escaping.
3595                 switch c = txt.charAt(cur++)
3596                         when "\t", "\u000a", "\u000c", ' ', '/', '>'
3597                                 # end of the word: compare what was collected in temporary_buffer
3598                                 tok_state = if temporary_buffer is 'script' then tok_state_script_data_double_escaped else tok_state_script_data_escaped
3599                         else
3600                                 if is_uc_alpha(c)
3601                                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3602                                 else if is_lc_alpha(c)
3603                                         temporary_buffer += c
3604                                 else
3605                                         # Anything else
3606                                         tok_state = tok_state_script_data_escaped
3607                                         cur -= 1 # Reconsume
3608                                         return
3609                 # terminators and letters are emitted exactly as they appeared in the input
3610                 return new_character_token c
3611
3612         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3613         tok_state_script_data_double_escaped = ->
3614                 # Script data double escaped state: emits one character token per input character.
3615                 switch c = txt.charAt(cur++)
3616                         when "\u0000"
3617                                 parse_error()
3618                                 return new_character_token "\ufffd"
3619                         when '' # EOF
3620                                 parse_error()
3621                                 tok_state = tok_state_data
3622                                 cur -= 1 # Reconsume
3623                                 return
3624                         when '-'
3625                                 tok_state = tok_state_script_data_double_escaped_dash
3626                         when '<'
3627                                 tok_state = tok_state_script_data_double_escaped_less_than_sign
3628                 # "-" and "<" (after their state switch above) and anything else
3629                 # are emitted as the character that was read
3630                 return new_character_token c
3631
3632         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3633         tok_state_script_data_double_escaped_dash = ->
3634                 # One "-" has been seen in double escaped script data.
3635                 switch c = txt.charAt(cur++)
3636                         when "\u0000"
3637                                 parse_error()
3638                                 tok_state = tok_state_script_data_double_escaped
3639                                 return new_character_token "\ufffd"
3640                         when '' # EOF
3641                                 parse_error()
3642                                 tok_state = tok_state_data
3643                                 cur -= 1 # Reconsume
3644                                 return
3645                         when '-'
3646                                 tok_state = tok_state_script_data_double_escaped_dash_dash
3647                         when '<'
3648                                 tok_state = tok_state_script_data_double_escaped_less_than_sign
3649                         else
3650                                 tok_state = tok_state_script_data_double_escaped
3651                 # every remaining case emits the character that was read
3652                 return new_character_token c
3653
3654         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3655         tok_state_script_data_double_escaped_dash_dash = ->
3656                 # Two or more "-" have been seen in double escaped script data;
3657                 # a ">" here returns to the plain script data state.
3658                 switch c = txt.charAt(cur++)
3659                         when "\u0000"
3660                                 parse_error()
3661                                 tok_state = tok_state_script_data_double_escaped
3662                                 return new_character_token "\ufffd"
3663                         when '' # EOF
3664                                 parse_error()
3665                                 tok_state = tok_state_data
3666                                 cur -= 1 # Reconsume
3667                                 return
3668                         when '<'
3669                                 tok_state = tok_state_script_data_double_escaped_less_than_sign
3670                         when '>'
3671                                 tok_state = tok_state_script_data
3672                         else
3673                                 # further dashes stay in this state; any other character leaves it
3674                                 tok_state = tok_state_script_data_double_escaped unless c is '-'
3675                 # every remaining case emits the character that was read
3676                 return new_character_token c
3677
3678         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3679         tok_state_script_data_double_escaped_less_than_sign = ->
3680                 # After "<" in double escaped script data: "/" may begin the closing "</script".
3681                 c = txt.charAt(cur) # peek only, so "Anything else" needs no reconsume
3682                 unless c is '/'
3683                         tok_state = tok_state_script_data_double_escaped
3684                         return
3685                 cur += 1 # consume the "/"
3686                 temporary_buffer = ''
3687                 tok_state = tok_state_script_data_double_escape_end
3688                 return new_character_token '/'
3689
3690         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3691         tok_state_script_data_double_escape_end = ->
3692                 # Reads the word after "</" in double escaped script data; "script" ends double escaping.
3693                 c = txt.charAt(cur++)
3694                 if c in ["\t", "\u000a", "\u000c", ' ', '/', '>']
3695                         # end of the word: compare what was collected in temporary_buffer
3696                         if temporary_buffer is 'script'
3697                                 tok_state = tok_state_script_data_escaped
3698                         else
3699                                 tok_state = tok_state_script_data_double_escaped
3700                 else if is_uc_alpha(c)
3701                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3702                 else if is_lc_alpha(c)
3703                         temporary_buffer += c
3704                 else
3705                         tok_state = tok_state_script_data_double_escaped
3706                         cur -= 1 # Reconsume
3707                         return
3708                 return new_character_token c
3709
3710         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3711         tok_state_before_attribute_name = ->
3712                 # Skips whitespace inside a tag, then starts a new [name, value] pair at the front of attrs_a.
3713                 attr_name = null
3714                 c = txt.charAt(cur++)
3715                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3716                         return null
3717                 if c is '/'
3718                         tok_state = tok_state_self_closing_start_tag
3719                         return null
3720                 if c is '>'
3721                         finished_tag = tok_cur_tag
3722                         tok_cur_tag = null
3723                         tok_state = tok_state_data
3724                         return finished_tag
3725                 if c is '' # EOF
3726                         parse_error()
3727                         tok_state = tok_state_data
3728                         return null
3729                 if c is "\u0000"
3730                         parse_error()
3731                         attr_name = "\ufffd"
3732                 else if c is '"' or c is "'" or c is '<' or c is '='
3733                         parse_error()
3734                         attr_name = c
3735                 else if is_uc_alpha(c)
3736                         attr_name = c.toLowerCase()
3737                 else
3738                         attr_name = c
3739                 tok_cur_tag.attrs_a.unshift [attr_name, '']
3740                 tok_state = tok_state_attribute_name
3741                 return null
3742
3743         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3744         tok_state_attribute_name = ->
3745                 # Extends the name of the attribute currently being read, which is the
3746                 # first entry of tok_cur_tag.attrs_a, or leaves the state on a delimiter.
3747                 switch c = txt.charAt(cur++)
3748                         when "\t", "\n", "\u000c", ' ' then tok_state = tok_state_after_attribute_name
3749                         when '/' then tok_state = tok_state_self_closing_start_tag
3750                         when '=' then tok_state = tok_state_before_attribute_value
3751                         when '' # EOF
3752                                 parse_error()
3753                                 tok_state = tok_state_data
3754                         when '>'
3755                                 tok_state = tok_state_data
3756                                 finished_tag = tok_cur_tag
3757                                 tok_cur_tag = null
3758                                 return finished_tag
3759                         else
3760                                 # NUL, quotes and "<" are parse errors but still become part of the name
3761                                 parse_error() if c is "\u0000" or c is '"' or c is "'" or c is '<'
3762                                 if c is "\u0000"
3763                                         name_piece = "\ufffd"
3764                                 else if is_uc_alpha(c)
3765                                         name_piece = c.toLowerCase()
3766                                 else
3767                                         name_piece = c
3768                                 # attribute names are stored lowercased
3769                                 tok_cur_tag.attrs_a[0][0] += name_piece
3770                 # only the ">" case emits a token
3771                 return null
3772
3773         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3774         tok_state_after_attribute_name = ->
3775                 # After an attribute name and whitespace: expects "=", "/", ">" or the next attribute name.
3776                 switch c = txt.charAt(cur++)
3777                         when "\t", "\n", "\u000c", ' '
3778                                 return
3779                         when '/'
3780                                 tok_state = tok_state_self_closing_start_tag
3781                                 return
3782                         when '='
3783                                 tok_state = tok_state_before_attribute_value
3784                                 return
3785                         when '>'
3786                                 # Emit the current tag token, like the other attribute states do
3787                                 tok_state = tok_state_data
3788                                 tmp = tok_cur_tag
3789                                 tok_cur_tag = null
3790                                 return tmp
3791                         when '' # EOF
3792                                 parse_error()
3793                                 tok_state = tok_state_data
3794                                 cur -= 1 # reconsume
3795                                 return
3796                         when "\u0000"
3797                                 parse_error()
3798                                 next_attr_name = "\ufffd"
3799                         when '"', "'", '<'
3800                                 parse_error()
3801                                 next_attr_name = c
3802                         else
3803                                 next_attr_name = if is_uc_alpha(c) then c.toLowerCase() else c
3804                 tok_cur_tag.attrs_a.unshift [next_attr_name, '']
3805                 tok_state = tok_state_attribute_name
3806                 return # explicit: an implicit return would hand the state function back as a token
3807
3808         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3809         tok_state_before_attribute_value = ->
3810                 # After "=": skips whitespace, then picks the quoted or unquoted value state.
3811                 switch c = txt.charAt(cur++)
3812                         when "\t", "\n", "\u000c", ' '
3813                                 return null
3814                         when '"' then tok_state = tok_state_attribute_value_double_quoted
3815                         when "'" then tok_state = tok_state_attribute_value_single_quoted
3816                         when '&'
3817                                 tok_state = tok_state_attribute_value_unquoted
3818                                 cur -= 1 # the unquoted state handles the "&" itself
3819                         when "\u0000"
3820                                 parse_error()
3821                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3822                                 tok_state = tok_state_attribute_value_unquoted
3823                         when '>'
3824                                 parse_error() # "=" with no value before the end of the tag
3825                                 tok_state = tok_state_data
3826                                 tmp = tok_cur_tag
3827                                 tok_cur_tag = null
3828                                 return tmp
3829                         when '' # EOF
3830                                 parse_error()
3831                                 tok_state = tok_state_data
3832                         else
3833                                 parse_error() if c is '<' or c is '=' or c is '`' # still starts the value
3834                                 tok_cur_tag.attrs_a[0][1] += c
3835                                 tok_state = tok_state_attribute_value_unquoted
3836                 return null
3837
3838         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3839         tok_state_attribute_value_double_quoted = ->
3840                 # Appends to the value of the attribute being read until the closing double quote.
3841                 switch c = txt.charAt(cur++)
3842                         when '"' then tok_state = tok_state_after_attribute_value_quoted
3843                         when '&'
3844                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3845                         when "\u0000"
3846                                 parse_error() # NUL is reported and replaced with U+FFFD
3847                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3848                         when '' # EOF
3849                                 parse_error()
3850                                 tok_state = tok_state_data
3851                         else
3852                                 tok_cur_tag.attrs_a[0][1] += c
3853                 return null
3854
3855         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3856         tok_state_attribute_value_single_quoted = ->
3857                 # Appends to the value of the attribute being read until the closing single quote.
3858                 switch c = txt.charAt(cur++)
3859                         when "'" then tok_state = tok_state_after_attribute_value_quoted
3860                         when '&'
3861                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3862                         when "\u0000"
3863                                 parse_error() # NUL is reported and replaced with U+FFFD
3864                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3865                         when '' # EOF
3866                                 parse_error()
3867                                 tok_state = tok_state_data
3868                         else
3869                                 tok_cur_tag.attrs_a[0][1] += c
3870                 return null
3871
3872         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3873         tok_state_attribute_value_unquoted = ->
3874                 # Appends to an unquoted attribute value until whitespace or ">" ends it.
3875                 switch c = txt.charAt(cur++)
3876                         when "\t", "\n", "\u000c", ' ' then tok_state = tok_state_before_attribute_name
3877                         when '&' then tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3878                         when '>'
3879                                 tok_state = tok_state_data
3880                                 tmp = tok_cur_tag
3881                                 tok_cur_tag = null
3882                                 return tmp
3883                         when "\u0000"
3884                                 parse_error() # NUL is reported and replaced with U+FFFD
3885                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3886                         when '' # EOF
3887                                 parse_error()
3888                                 tok_state = tok_state_data
3889                         else
3890                                 parse_error() if c in ['"', "'", '<', '=', '`'] # reported, but kept in the value
3891                                 tok_cur_tag.attrs_a[0][1] += c
3892                 return null
3893
3894         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3895         tok_state_after_attribute_value_quoted = ->
3896                 # Right after a closing quote: expects whitespace, "/" or ">".
3897                 switch c = txt.charAt(cur++)
3898                         when "\t", "\n", "\u000c", ' ' then tok_state = tok_state_before_attribute_name
3899                         when '/' then tok_state = tok_state_self_closing_start_tag
3900                         when '>'
3901                                 tok_state = tok_state_data
3902                                 tmp = tok_cur_tag
3903                                 tok_cur_tag = null
3904                                 return tmp
3905                         when '' # EOF
3906                                 parse_error()
3907                                 tok_state = tok_state_data
3908                         else
3909                                 # No whitespace between attributes: report it, then treat the character as the start of the next one
3910                                 parse_error()
3911                                 tok_state = tok_state_before_attribute_name
3912                                 cur -= 1 # we didn't handle that char
3913                 return null
3914
3915         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3916         tok_state_self_closing_start_tag = ->
3917                 # After "/" inside a tag: only ">" completes a self-closing tag.
3918                 c = txt.charAt(cur++)
3919                 if c is '>'
3920                         tok_cur_tag.flag 'self-closing', true
3921                         tok_state = tok_state_data
3922                         return tok_cur_tag
3923                 # EOF and anything else are both parse errors that reconsume the
3924                 # character; they differ only in the state that reads it next.
3925                 parse_error()
3926                 if c is '' # EOF
3927                         tok_state = tok_state_data
3928                 else
3929                         tok_state = tok_state_before_attribute_name
3930                 cur -= 1 # Reconsume
3931                 return
3932
3933         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3934         # WARNING: put a comment token in tok_cur_tag before setting this state
3935         tok_state_bogus_comment = ->
3936                 # Consumes everything up to the next ">" (or to the end of the input), appends
3937                 # it to the comment token already stored in tok_cur_tag, and returns that token.
3938                 gt_at = txt.indexOf '>', cur
3939                 stop_at = if gt_at is -1 then txt.length else gt_at
3940                 # NUL characters in the comment text are replaced with U+FFFD
3941                 comment_text = txt.slice(cur, stop_at).replace(new RegExp("\u0000", 'g'), "\ufffd")
3942                 # step over the ">" when there was one; otherwise stop at the end of the input
3943                 cur = if gt_at is -1 then stop_at else stop_at + 1
3944                 tok_cur_tag.text += comment_text
3945                 tok_state = tok_state_data
3946                 return tok_cur_tag
3947
3948         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3949         tok_state_markup_declaration_open = ->
3950                 # After "<!": looks ahead for "--", "doctype" (any case) or "[CDATA[";
3951                 # anything else becomes a bogus comment. Never emits a token itself.
3952                 ahead = txt.substr cur, 7
3953                 if ahead.substr(0, 2) is '--'
3954                         cur += 2
3955                         tok_cur_tag = new_comment_token ''
3956                         tok_state = tok_state_comment_start
3957                 else if ahead.toLowerCase() is 'doctype'
3958                         cur += 7
3959                         tok_state = tok_state_doctype
3960                 else if (acn = adjusted_current_node()) and acn.namespace isnt NS_HTML and ahead is '[CDATA['
3961                         cur += 7
3962                         tok_state = tok_state_cdata_section
3963                 else
3964                         # Otherwise
3965                         parse_error()
3966                         tok_cur_tag = new_comment_token ''
3967                         tok_state = tok_state_bogus_comment
3968                 return
3969
3970         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3971         tok_state_comment_start = ->
3972                 switch c = txt.charAt(cur++)
3973                         when '-'
3974                                 tok_state = tok_state_comment_start_dash
3975                         when "\u0000"
3976                                 parse_error()
3977                                 tok_state = tok_state_comment
3978                                 return new_character_token "\ufffd"
3979                         when '>'
3980                                 parse_error()
3981                                 tok_state = tok_state_data
3982                                 return tok_cur_tag
3983                         when '' # EOF
3984                                 parse_error()
3985                                 tok_state = tok_state_data
3986                                 cur -= 1 # Reconsume
3987                                 return tok_cur_tag
3988                         else
3989                                 tok_cur_tag.text += c
3990                                 tok_state = tok_state_comment
3991                 return null
3992
3993         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3994         tok_state_comment_start_dash = ->
3995                 switch c = txt.charAt(cur++)
3996                         when '-'
3997                                 tok_state = tok_state_comment_end
3998                         when "\u0000"
3999                                 parse_error()
4000                                 tok_cur_tag.text += "-\ufffd"
4001                                 tok_state = tok_state_comment
4002                         when '>'
4003                                 parse_error()
4004                                 tok_state = tok_state_data
4005                                 return tok_cur_tag
4006                         when '' # EOF
4007                                 parse_error()
4008                                 tok_state = tok_state_data
4009                                 cur -= 1 # Reconsume
4010                                 return tok_cur_tag
4011                         else
4012                                 tok_cur_tag.text += "-#{c}"
4013                                 tok_state = tok_state_comment
4014                 return null
4015
4016         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4017         tok_state_comment = ->
4018                 switch c = txt.charAt(cur++)
4019                         when '-'
4020                                 tok_state = tok_state_comment_end_dash
4021                         when "\u0000"
4022                                 parse_error()
4023                                 tok_cur_tag.text += "\ufffd"
4024                         when '' # EOF
4025                                 parse_error()
4026                                 tok_state = tok_state_data
4027                                 cur -= 1 # Reconsume
4028                                 return tok_cur_tag
4029                         else
4030                                 tok_cur_tag.text += c
4031                 return null
4032
4033         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4034         tok_state_comment_end_dash = ->
4035                 switch c = txt.charAt(cur++)
4036                         when '-'
4037                                 tok_state = tok_state_comment_end
4038                         when "\u0000"
4039                                 parse_error()
4040                                 tok_cur_tag.text += "-\ufffd"
4041                                 tok_state = tok_state_comment
4042                         when '' # EOF
4043                                 parse_error()
4044                                 tok_state = tok_state_data
4045                                 cur -= 1 # Reconsume
4046                                 return tok_cur_tag
4047                         else
4048                                 tok_cur_tag.text += "-#{c}"
4049                                 tok_state = tok_state_comment
4050                 return null
4051
4052         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4053         tok_state_comment_end = ->
4054                 switch c = txt.charAt(cur++)
4055                         when '>'
4056                                 tok_state = tok_state_data
4057                                 return tok_cur_tag
4058                         when "\u0000"
4059                                 parse_error()
4060                                 tok_cur_tag.text += "--\ufffd"
4061                                 tok_state = tok_state_comment
4062                         when '!'
4063                                 parse_error()
4064                                 tok_state = tok_state_comment_end_bang
4065                         when '-'
4066                                 parse_error()
4067                                 tok_cur_tag.text += '-'
4068                         when '' # EOF
4069                                 parse_error()
4070                                 tok_state = tok_state_data
4071                                 cur -= 1 # Reconsume
4072                                 return tok_cur_tag
4073                         else
4074                                 parse_error()
4075                                 tok_cur_tag.text += "--#{c}"
4076                                 tok_state = tok_state_comment
4077                 return null
4078
4079         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4080         tok_state_comment_end_bang = ->
4081                 switch c = txt.charAt(cur++)
4082                         when '-'
4083                                 tok_cur_tag.text += "--!#{c}"
4084                                 tok_state = tok_state_comment_end_dash
4085                         when '>'
4086                                 tok_state = tok_state_data
4087                                 return tok_cur_tag
4088                         when "\u0000"
4089                                 parse_error()
4090                                 tok_cur_tag.text += "--!\ufffd"
4091                                 tok_state = tok_state_comment
4092                         when '' # EOF
4093                                 parse_error()
4094                                 tok_state = tok_state_data
4095                                 cur -= 1 # Reconsume
4096                                 return tok_cur_tag
4097                         else
4098                                 tok_cur_tag.text += "--!#{c}"
4099                                 tok_state = tok_state_comment
4100                 return null
4101
4102         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4103         tok_state_doctype = ->
4104                 switch c = txt.charAt(cur++)
4105                         when "\t", "\u000a", "\u000c", ' '
4106                                 tok_state = tok_state_before_doctype_name
4107                         when '' # EOF
4108                                 parse_error()
4109                                 tok_state = tok_state_data
4110                                 el = new_doctype_token ''
4111                                 el.flag 'force-quirks', true
4112                                 cur -= 1 # Reconsume
4113                                 return el
4114                         else
4115                                 parse_error()
4116                                 tok_state = tok_state_before_doctype_name
4117                                 cur -= 1 # Reconsume
4118                 return null
4119
        # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4121         tok_state_before_doctype_name = ->
4122                 c = txt.charAt(cur++)
4123                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4124                         return
4125                 if is_uc_alpha(c)
4126                         tok_cur_tag = new_doctype_token c.toLowerCase()
4127                         tok_state = tok_state_doctype_name
4128                         return
4129                 if c is "\u0000"
4130                         parse_error()
4131                         tok_cur_tag = new_doctype_token "\ufffd"
4132                         tok_state = tok_state_doctype_name
4133                         return
4134                 if c is '>'
4135                         parse_error()
4136                         el = new_doctype_token ''
4137                         el.flag 'force-quirks', true
4138                         tok_state = tok_state_data
4139                         return el
4140                 if c is '' # EOF
4141                         parse_error()
4142                         tok_state = tok_state_data
4143                         el = new_doctype_token ''
4144                         el.flag 'force-quirks', true
4145                         cur -= 1 # Reconsume
4146                         return el
4147                 # Anything else
4148                 tok_cur_tag = new_doctype_token c
4149                 tok_state = tok_state_doctype_name
4150                 return null
4151
4152         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4153         tok_state_doctype_name = ->
4154                 c = txt.charAt(cur++)
4155                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4156                         tok_state = tok_state_after_doctype_name
4157                         return
4158                 if c is '>'
4159                         tok_state = tok_state_data
4160                         return tok_cur_tag
4161                 if is_uc_alpha(c)
4162                         tok_cur_tag.name += c.toLowerCase()
4163                         return
4164                 if c is "\u0000"
4165                         parse_error()
4166                         tok_cur_tag.name += "\ufffd"
4167                         return
4168                 if c is '' # EOF
4169                         parse_error()
4170                         tok_state = tok_state_data
4171                         tok_cur_tag.flag 'force-quirks', true
4172                         cur -= 1 # Reconsume
4173                         return tok_cur_tag
4174                 # Anything else
4175                 tok_cur_tag.name += c
4176                 return null
4177
4178         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4179         tok_state_after_doctype_name = ->
4180                 c = txt.charAt(cur++)
4181                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4182                         return
4183                 if c is '>'
4184                         tok_state = tok_state_data
4185                         return tok_cur_tag
4186                 if c is '' # EOF
4187                         parse_error()
4188                         tok_state = tok_state_data
4189                         tok_cur_tag.flag 'force-quirks', true
4190                         cur -= 1 # Reconsume
4191                         return tok_cur_tag
4192                 # Anything else
4193                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4194                         cur += 5
4195                         tok_state = tok_state_after_doctype_public_keyword
4196                         return
4197                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4198                         cur += 5
4199                         tok_state = tok_state_after_doctype_system_keyword
4200                         return
4201                 parse_error()
4202                 tok_cur_tag.flag 'force-quirks', true
4203                 tok_state = tok_state_bogus_doctype
4204                 return null
4205
4206         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4207         tok_state_after_doctype_public_keyword = ->
4208                 c = txt.charAt(cur++)
4209                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4210                         tok_state = tok_state_before_doctype_public_identifier
4211                         return
4212                 if c is '"'
4213                         parse_error()
4214                         tok_cur_tag.public_identifier = ''
4215                         tok_state = tok_state_doctype_public_identifier_double_quoted
4216                         return
4217                 if c is "'"
4218                         parse_error()
4219                         tok_cur_tag.public_identifier = ''
4220                         tok_state = tok_state_doctype_public_identifier_single_quoted
4221                         return
4222                 if c is '>'
4223                         parse_error()
4224                         tok_cur_tag.flag 'force-quirks', true
4225                         tok_state = tok_state_data
4226                         return tok_cur_tag
4227                 if c is '' # EOF
4228                         parse_error()
4229                         tok_state = tok_state_data
4230                         tok_cur_tag.flag 'force-quirks', true
4231                         cur -= 1 # Reconsume
4232                         return tok_cur_tag
4233                 # Anything else
4234                 parse_error()
4235                 tok_cur_tag.flag 'force-quirks', true
4236                 tok_state = tok_state_bogus_doctype
4237                 return null
4238
4239         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4240         tok_state_before_doctype_public_identifier = ->
4241                 c = txt.charAt(cur++)
4242                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4243                         return
4244                 if c is '"'
4245                         parse_error()
4246                         tok_cur_tag.public_identifier = ''
4247                         tok_state = tok_state_doctype_public_identifier_double_quoted
4248                         return
4249                 if c is "'"
4250                         parse_error()
4251                         tok_cur_tag.public_identifier = ''
4252                         tok_state = tok_state_doctype_public_identifier_single_quoted
4253                         return
4254                 if c is '>'
4255                         parse_error()
4256                         tok_cur_tag.flag 'force-quirks', true
4257                         tok_state = tok_state_data
4258                         return tok_cur_tag
4259                 if c is '' # EOF
4260                         parse_error()
4261                         tok_state = tok_state_data
4262                         tok_cur_tag.flag 'force-quirks', true
4263                         cur -= 1 # Reconsume
4264                         return tok_cur_tag
4265                 # Anything else
4266                 parse_error()
4267                 tok_cur_tag.flag 'force-quirks', true
4268                 tok_state = tok_state_bogus_doctype
4269                 return null
4270
4271
4272         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4273         tok_state_doctype_public_identifier_double_quoted = ->
4274                 c = txt.charAt(cur++)
4275                 if c is '"'
4276                         tok_state = tok_state_after_doctype_public_identifier
4277                         return
4278                 if c is "\u0000"
4279                         parse_error()
4280                         tok_cur_tag.public_identifier += "\ufffd"
4281                         return
4282                 if c is '>'
4283                         parse_error()
4284                         tok_cur_tag.flag 'force-quirks', true
4285                         tok_state = tok_state_data
4286                         return tok_cur_tag
4287                 if c is '' # EOF
4288                         parse_error()
4289                         tok_state = tok_state_data
4290                         tok_cur_tag.flag 'force-quirks', true
4291                         cur -= 1 # Reconsume
4292                         return tok_cur_tag
4293                 # Anything else
4294                 tok_cur_tag.public_identifier += c
4295                 return null
4296
4297         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4298         tok_state_doctype_public_identifier_single_quoted = ->
4299                 c = txt.charAt(cur++)
4300                 if c is "'"
4301                         tok_state = tok_state_after_doctype_public_identifier
4302                         return
4303                 if c is "\u0000"
4304                         parse_error()
4305                         tok_cur_tag.public_identifier += "\ufffd"
4306                         return
4307                 if c is '>'
4308                         parse_error()
4309                         tok_cur_tag.flag 'force-quirks', true
4310                         tok_state = tok_state_data
4311                         return tok_cur_tag
4312                 if c is '' # EOF
4313                         parse_error()
4314                         tok_state = tok_state_data
4315                         tok_cur_tag.flag 'force-quirks', true
4316                         cur -= 1 # Reconsume
4317                         return tok_cur_tag
4318                 # Anything else
4319                 tok_cur_tag.public_identifier += c
4320                 return null
4321
4322         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4323         tok_state_after_doctype_public_identifier = ->
4324                 c = txt.charAt(cur++)
4325                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4326                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4327                         return
4328                 if c is '>'
4329                         tok_state = tok_state_data
4330                         return tok_cur_tag
4331                 if c is '"'
4332                         parse_error()
4333                         tok_cur_tag.system_identifier = ''
4334                         tok_state = tok_state_doctype_system_identifier_double_quoted
4335                         return
4336                 if c is "'"
4337                         parse_error()
4338                         tok_cur_tag.system_identifier = ''
4339                         tok_state = tok_state_doctype_system_identifier_single_quoted
4340                         return
4341                 if c is '' # EOF
4342                         parse_error()
4343                         tok_state = tok_state_data
4344                         tok_cur_tag.flag 'force-quirks', true
4345                         cur -= 1 # Reconsume
4346                         return tok_cur_tag
4347                 # Anything else
4348                 parse_error()
4349                 tok_cur_tag.flag 'force-quirks', true
4350                 tok_state = tok_state_bogus_doctype
4351                 return null
4352
4353         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4354         tok_state_between_doctype_public_and_system_identifiers = ->
4355                 c = txt.charAt(cur++)
4356                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4357                         return
4358                 if c is '>'
4359                         tok_state = tok_state_data
4360                         return tok_cur_tag
4361                 if c is '"'
4362                         parse_error()
4363                         tok_cur_tag.system_identifier = ''
4364                         tok_state = tok_state_doctype_system_identifier_double_quoted
4365                         return
4366                 if c is "'"
4367                         parse_error()
4368                         tok_cur_tag.system_identifier = ''
4369                         tok_state = tok_state_doctype_system_identifier_single_quoted
4370                         return
4371                 if c is '' # EOF
4372                         parse_error()
4373                         tok_state = tok_state_data
4374                         tok_cur_tag.flag 'force-quirks', true
4375                         cur -= 1 # Reconsume
4376                         return tok_cur_tag
4377                 # Anything else
4378                 parse_error()
4379                 tok_cur_tag.flag 'force-quirks', true
4380                 tok_state = tok_state_bogus_doctype
4381                 return null
4382
4383         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4384         tok_state_after_doctype_system_keyword = ->
4385                 c = txt.charAt(cur++)
4386                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4387                         tok_state = tok_state_before_doctype_system_identifier
4388                         return
4389                 if c is '"'
4390                         parse_error()
4391                         tok_cur_tag.system_identifier = ''
4392                         tok_state = tok_state_doctype_system_identifier_double_quoted
4393                         return
4394                 if c is "'"
4395                         parse_error()
4396                         tok_cur_tag.system_identifier = ''
4397                         tok_state = tok_state_doctype_system_identifier_single_quoted
4398                         return
4399                 if c is '>'
4400                         parse_error()
4401                         tok_cur_tag.flag 'force-quirks', true
4402                         tok_state = tok_state_data
4403                         return tok_cur_tag
4404                 if c is '' # EOF
4405                         parse_error()
4406                         tok_state = tok_state_data
4407                         tok_cur_tag.flag 'force-quirks', true
4408                         cur -= 1 # Reconsume
4409                         return tok_cur_tag
4410                 # Anything else
4411                 parse_error()
4412                 tok_cur_tag.flag 'force-quirks', true
4413                 tok_state = tok_state_bogus_doctype
4414                 return null
4415
4416         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4417         tok_state_before_doctype_system_identifier = ->
4418                 c = txt.charAt(cur++)
4419                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4420                         return
4421                 if c is '"'
4422                         tok_cur_tag.system_identifier = ''
4423                         tok_state = tok_state_doctype_system_identifier_double_quoted
4424                         return
4425                 if c is "'"
4426                         tok_cur_tag.system_identifier = ''
4427                         tok_state = tok_state_doctype_system_identifier_single_quoted
4428                         return
4429                 if c is '>'
4430                         parse_error()
4431                         tok_cur_tag.flag 'force-quirks', true
4432                         tok_state = tok_state_data
4433                         return tok_cur_tag
4434                 if c is '' # EOF
4435                         parse_error()
4436                         tok_state = tok_state_data
4437                         tok_cur_tag.flag 'force-quirks', true
4438                         cur -= 1 # Reconsume
4439                         return tok_cur_tag
4440                 # Anything else
4441                 parse_error()
4442                 tok_cur_tag.flag 'force-quirks', true
4443                 tok_state = tok_state_bogus_doctype
4444                 return null
4445
4446         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4447         tok_state_doctype_system_identifier_double_quoted = ->
4448                 c = txt.charAt(cur++)
4449                 if c is '"'
4450                         tok_state = tok_state_after_doctype_system_identifier
4451                         return
4452                 if c is "\u0000"
4453                         parse_error()
4454                         tok_cur_tag.system_identifier += "\ufffd"
4455                         return
4456                 if c is '>'
4457                         parse_error()
4458                         tok_cur_tag.flag 'force-quirks', true
4459                         tok_state = tok_state_data
4460                         return tok_cur_tag
4461                 if c is '' # EOF
4462                         parse_error()
4463                         tok_state = tok_state_data
4464                         tok_cur_tag.flag 'force-quirks', true
4465                         cur -= 1 # Reconsume
4466                         return tok_cur_tag
4467                 # Anything else
4468                 tok_cur_tag.system_identifier += c
4469                 return null
4470
4471         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4472         tok_state_doctype_system_identifier_single_quoted = ->
4473                 c = txt.charAt(cur++)
4474                 if c is "'"
4475                         tok_state = tok_state_after_doctype_system_identifier
4476                         return
4477                 if c is "\u0000"
4478                         parse_error()
4479                         tok_cur_tag.system_identifier += "\ufffd"
4480                         return
4481                 if c is '>'
4482                         parse_error()
4483                         tok_cur_tag.flag 'force-quirks', true
4484                         tok_state = tok_state_data
4485                         return tok_cur_tag
4486                 if c is '' # EOF
4487                         parse_error()
4488                         tok_state = tok_state_data
4489                         tok_cur_tag.flag 'force-quirks', true
4490                         cur -= 1 # Reconsume
4491                         return tok_cur_tag
4492                 # Anything else
4493                 tok_cur_tag.system_identifier += c
4494                 return null
4495
4496         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4497         tok_state_after_doctype_system_identifier = ->
4498                 c = txt.charAt(cur++)
4499                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4500                         return
4501                 if c is '>'
4502                         tok_state = tok_state_data
4503                         return tok_cur_tag
4504                 if c is '' # EOF
4505                         parse_error()
4506                         tok_state = tok_state_data
4507                         tok_cur_tag.flag 'force-quirks', true
4508                         cur -= 1 # Reconsume
4509                         return tok_cur_tag
4510                 # Anything else
4511                 parse_error()
4512                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4513                 tok_state = tok_state_bogus_doctype
4514                 return null
4515
4516         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4517         tok_state_bogus_doctype = ->
4518                 c = txt.charAt(cur++)
4519                 if c is '>'
4520                         tok_state = tok_state_data
4521                         return tok_cur_tag
4522                 if c is '' # EOF
4523                         tok_state = tok_state_data
4524                         cur -= 1 # Reconsume
4525                         return tok_cur_tag
4526                 # Anything else
4527                 return null
4528
4529         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4530         tok_state_cdata_section = ->
4531                 tok_state = tok_state_data
4532                 next_gt = txt.indexOf ']]>', cur
4533                 if next_gt is -1
4534                         val = txt.substr cur
4535                         cur = txt.length
4536                 else
4537                         val = txt.substr cur, (next_gt - cur)
4538                         cur = next_gt + 3
4539                 if val.length > 0
4540                         return new_character_token val # fixfull split
4541                 return null
4542
4543         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4544         # Don't set this as a state, just call it
4545         # returns a string (NOT a text node)
4546         parse_character_reference = (allowed_char = null, in_attr = false) ->
4547                 if cur >= txt.length
4548                         return '&'
4549                 switch c = txt.charAt(cur)
4550                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4551                                 # explicitly not a parse error
4552                                 return '&'
4553                         when ';'
4554                                 # there has to be "one or more" alnums between & and ; to be a parse error
4555                                 return '&'
4556                         when '#'
4557                                 if cur + 1 >= txt.length
4558                                         return '&'
4559                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4560                                         base = 16
4561                                         charset = hex_chars
4562                                         start = cur + 2
4563                                 else
4564                                         charset = digits
4565                                         start = cur + 1
4566                                         base = 10
4567                                 i = 0
4568                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4569                                         i += 1
4570                                 if i is 0
4571                                         return '&'
4572                                 cur = start + i
4573                                 if txt.charAt(start + i) is ';'
4574                                         cur += 1
4575                                 else
4576                                         parse_error()
4577                                 code_point = txt.substr(start, i)
4578                                 while code_point.charAt(0) is '0' and code_point.length > 1
4579                                         code_point = code_point.substr 1
4580                                 code_point = parseInt(code_point, base)
4581                                 if unicode_fixes[code_point]?
4582                                         parse_error()
4583                                         return unicode_fixes[code_point]
4584                                 else
4585                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4586                                                 parse_error()
4587                                                 return "\ufffd"
4588                                         else
4589                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4590                                                         parse_error()
4591                                                 return from_code_point code_point
4592                                 return
4593                         else
4594                                 for i in [0...31]
4595                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4596                                                 break
4597                                 if i is 0
4598                                         # exit early, because parse_error() below needs at least one alnum
4599                                         return '&'
4600                                 if txt.charAt(cur + i) is ';'
4601                                         i += 1 # include ';' terminator in value
4602                                         decoded = decode_named_char_ref txt.substr(cur, i)
4603                                         if decoded?
4604                                                 cur += i
4605                                                 return decoded
4606                                         parse_error()
4607                                         return '&'
4608                                 else
4609                                         # no ';' terminator (only legacy char refs)
4610                                         max = i
4611                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4612                                                 c = legacy_char_refs[txt.substr(cur, i)]
4613                                                 if c?
4614                                                         if in_attr
4615                                                                 if txt.charAt(cur + i) is '='
4616                                                                         # "because some legacy user agents will
4617                                                                         # misinterpret the markup in those cases"
4618                                                                         parse_error()
4619                                                                         return '&'
4620                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4621                                                                         # this makes attributes forgiving about url args
4622                                                                         return '&'
4623                                                         # ok, and besides the weird exceptions for attributes...
4624                                                         # return the matching char
4625                                                         cur += i # consume entity chars
4626                                                         parse_error() # because no terminating ";"
4627                                                         return c
4628                                         parse_error()
4629                                         return '&'
4630                 return # never reached
4631
4632         # tree constructor initialization
4633         # see comments on TYPE_TAG/etc for the structure of this data
4634         txt = args.html
4635         cur = 0
4636         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4637         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4638         open_els = []
4639         afe = [] # active formatting elements
4640         template_ins_modes = []
4641         ins_mode = ins_mode_initial
4642         original_ins_mode = ins_mode # TODO check spec
4643         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4644         flag_frameset_ok = true
4645         flag_parsing = true
4646         flag_foster_parenting = false
4647         form_element_pointer = null
4648         temporary_buffer = null
4649         pending_table_character_tokens = []
4650         head_element_pointer = null
4651         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4652         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4653         prev_node_id = 0 # just for debugging
4654
4655         # tokenizer initialization
4656         tok_state = tok_state_data
4657
4658         # text pre-processing
4659         # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4660         txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4661         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4662         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4663
4664         if args.name is "tests23.dat #1"
4665                 console.log "hi"
4666         # proccess input
4667         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4668         while flag_parsing
4669                 t = tok_state()
4670                 if t?
4671                         process_token t
4672                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4673         return doc.children
4674
# Serialize a list of nodes into one comma-separated string.
#
# els: array of objects that have a serialize(shallow, show_ids) method
# shallow, show_ids: passed through untouched to each element's serialize()
#
# returns '' for an empty list
serialize_els = (els, shallow, show_ids) ->
        # interpolation coerces each result to a string the same way that string
        # concatenation would
        pieces = ("#{el.serialize shallow, show_ids}" for el in els)
        return pieces.join ','
4683
# Public API. Everything listed here is attached to module.exports under its
# own name: the parser, the debug log helpers, and the node type, namespace and
# quirks mode constants.
exported_api = {
        parse_html, debug_log_reset, debug_log_each,
        TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE,
        NS_HTML, NS_MATHML, NS_SVG,
        QUIRKS_NO, QUIRKS_LIMITED, QUIRKS_YES
}
for own export_name, export_value of exported_api
        module.exports[export_name] = export_value