JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implemented adoption agency algorithm
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost"
42 #   5: f
43 #   4: e "previous" (to d), "above"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
54 TYPE_COMMENT = 2
55 TYPE_DOCTYPE = 3
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
59 TYPE_EOF = 6
60 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
62
63 # namespace constants
64 NS_HTML = 1
65 NS_MATHML = 2
66 NS_SVG = 3
67
68 g_debug_log = []
69 debug_log_reset = ->
70         g_debug_log = []
71 debug_log = (str) ->
72         g_debug_log.push str
73 debug_log_each = (cb) ->
74         for str in g_debug_log
75                 cb str
76
77 prev_node_id = 0
78 class Node
79         constructor: (type, args = {}) ->
80                 @type = type # one of the TYPE_* constants above
81                 @name = args.name ? '' # tag name
82                 @text = args.text ? '' # contents for text/comment nodes
83                 @attrs = args.attrs ? {}
84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
85                 @children = args.children ? []
86                 @namespace = args.namespace ? NS_HTML
87                 @parent = args.parent ? null
88                 if args.id?
89                         @id = "#{args.id}+"
90                 else
91                         @id = "#{++prev_node_id}"
92         shallow_clone: -> # return a new node that's the same except without the children or parent
93                 # WARNING this doesn't work right on open tags that are still being parsed
94                 attrs = {}
95                 attrs[k] = v for k, v of @attrs
96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97         serialize: (shallow = false, show_ids = false) -> # for unit tests
98                 ret = ''
99                 switch @type
100                         when TYPE_TAG
101                                 ret += 'tag:'
102                                 ret += JSON.stringify @name
103                                 ret += ','
104                                 if show_ids
105                                         ret += "##{@id},"
106                                 if shallow
107                                         break
108                                 ret += JSON.stringify @attrs
109                                 ret += ',['
110                                 sep = ''
111                                 for c in @children
112                                         ret += sep
113                                         sep = ','
114                                         ret += c.serialize shallow, show_ids
115                                 ret += ']'
116                         when TYPE_TEXT
117                                 ret += 'text:'
118                                 ret += JSON.stringify @text
119                         when TYPE_COMMENT
120                                 ret += 'comment:'
121                                 ret += JSON.stringify @text
122                         when TYPE_DOCTYPE
123                                 ret += 'doctype'
124                                 # FIXME
125                         when TYPE_MARKER
126                                 ret += 'marker'
127                         when TYPE_AAA_BOOKMARK
128                                 ret += 'aaa_bookmark'
129                         else
130                                 ret += 'unknown:'
131                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
132                 return ret
133
134 # helpers: (only take args that are normally known when parser creates nodes)
135 new_open_tag = (name) ->
136         return new Node TYPE_OPEN_TAG, name: name
137 new_end_tag = (name) ->
138         return new Node TYPE_END_TAG, name: name
139 new_element = (name) ->
140         return new Node TYPE_TAG, name: name
141 new_text_node = (txt) ->
142         return new Node TYPE_TEXT, text: txt
143 new_comment_node = (txt) ->
144         return new Node TYPE_COMMENT, text: txt
145 new_eof_token = ->
146         return new Node TYPE_EOF
147 new_aaa_bookmark = ->
148         return new Node TYPE_AAA_BOOKMARK
149
150 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
151 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
152 digits = "0123456789"
153 alnum = lc_alpha + uc_alpha + digits
154 hex_chars = digits + "abcdefABCDEF"
155
156 # some SVG elements have dashes in them
157 tag_name_chars = alnum + "-"
158
159 # http://www.w3.org/TR/html5/infrastructure.html#space-character
160 space_chars = "\u0009\u000a\u000c\u000d\u0020"
161
162 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
163 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
164
165 # These are the character references that don't need a terminating semicolon
166 # min length: 2, max: 6, none are a prefix of any other.
167 legacy_char_refs = {
168         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
169         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
170         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
171         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
172         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
173         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
174         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
175         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
176         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
177         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
178         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
179         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
180         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
181         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
182         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
183         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
184         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
185         yen: '¥', yuml: 'ÿ'
186 }
187
188 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
189 raw_text_elements = ['script', 'style']
190 escapable_raw_text_elements = ['textarea', 'title']
191 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
192 svg_elements = [
193         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
194         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
195         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
196         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
197         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
198         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
199         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
200         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
201         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
202         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
203         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
204         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
205         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
206         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
207         'view', 'vkern'
208 ]
209
210 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
211 mathml_elements = [
212         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
213         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
214         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
215         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
216         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
217         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
218         'determinant', 'diff', 'divergence', 'divide', 'domain',
219         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
220         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
221         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
222         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
223         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
224         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
225         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
226         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
227         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
228         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
229         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
230         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
231         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
232         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
233         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
234         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
235         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
236         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
237         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
238         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
239         'vectorproduct', 'xor'
240 ]
241 # foreign_elements = [svg_elements..., mathml_elements...]
242 #normal_elements = All other allowed HTML elements are normal elements.
243
244 special_elements = {
245         # HTML:
246         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
247         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
248         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
249         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
250         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
251         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
252         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
253         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
254         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
255         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
256         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
257         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
258         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
259         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
260         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
261         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
262         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
263         wbr:NS_HTML, xmp:NS_HTML,
264
265         # MathML:
266         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
267         'annotation-xml':NS_MATHML,
268
269         # SVG:
270         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
271 }
272
273 formatting_elements = {
274          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
275          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
276          u: true
277 }
278
279 # all html I presume
280 end_tag_implied = {
281         dd: true
282         dt: true
283         li: true
284         option: true
285         optgroup: true
286         p: true
287         rb: true
288         rp: true
289         rt: true
290         rtc: true
291 }
292
293 el_is_special = (e) ->
294         return special_elements[e.name]?
295         # FIXME it should really be:
296         #return special_elements[e.name] is e.namespace
297
298 # decode_named_char_ref()
299 #
300 # The list of named character references is _huge_ so ask the browser to decode
301 # for us instead of wasting bandwidth/space on including the table here.
302 #
303 # Pass without the "&" but with the ";" examples:
304 #    for "&amp" pass "amp;"
305 #    for "&#x2032" pass "x2032;"
306 g_dncr = {
307         cache: {}
308         textarea: document.createElement('textarea')
309 }
310 # TODO test this in IE8
311 decode_named_char_ref = (txt) ->
312         txt = "&#{txt}"
313         decoded = g_dncr.cache[txt]
314         return decoded if decoded?
315         g_dncr.textarea.innerHTML = txt
316         decoded = g_dncr.textarea.value
317         return null if decoded is txt
318         return g_dncr.cache[txt] = decoded
319
320 parse_html = (txt, parse_error_cb = null) ->
321         cur = 0 # index of next char in txt to be parsed
322         # declare tree and tokenizer variables so they're in scope below
323         tree = null
324         open_els = [] # stack of open elements
325         tree_state = null
326         tok_state = null
327         tok_cur_tag = null # partially parsed tag
328         flag_frameset_ok = null
329         flag_parsing = null
330         flag_foster_parenting = null
331         afe = [] # active formatting elements
332
333         parse_error = ->
334                 if parse_error_cb?
335                         parse_error_cb cur
336                 else
337                         console.log "Parse error at character #{cur} of #{txt.length}"
338
339
340         # the functions below impliment the Tree Contstruction algorithm
341         # http://www.w3.org/TR/html5/syntax.html#tree-construction
342
343         # But first... the helpers
344         template_tag_is_open = ->
345                 for t in open_els
346                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
347                                 return true
348                 return false
349         is_in_scope_x = (tag_name, scope, namespace) ->
350                 for t in open_els
351                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
352                                 return true
353                         if scope[t.name] is t.namespace
354                                 return false
355                 return false
356         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
357                 for t in open_els
358                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
359                                 return true
360                         if scope[t.name] is t.namespace
361                                 return false
362                         if scope2[t.name] is t.namespace
363                                 return false
364                 return false
365         standard_scopers = { # FIXME these are supposed to be namespace specific
366                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
367                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
368                 template: NS_HTML, mi: NS_MATHML,
369
370                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
371                 'annotation-xml': NS_MATHML,
372
373                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
374         }
375         button_scopers = button: NS_HTML
376         li_scopers = ol: NS_HTML, ul: NS_HTML
377         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
378         is_in_scope = (tag_name, namespace = null) ->
379                 return is_in_scope_x tag_name, standard_scopers, namespace
380         is_in_button_scope = (tag_name, namespace = null) ->
381                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
382         is_in_table_scope = (tag_name, namespace = null) ->
383                 return is_in_scope_x tag_name, table_scopers, namespace
384         is_in_select_scope = (tag_name, namespace = null) ->
385                 for t in open_els
386                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
387                                 return true
388                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
389                                 return false
390                 return false
391         # this checks for a particular element, not by name
392         el_is_in_scope = (el) ->
393                 for t in open_els
394                         if t is el
395                                 return true
396                         if standard_scopers[t.name] is t.namespace
397                                 return false
398                 return false
399
400         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
401         # this implementation is structured (mostly) as described at the link above.
402         # capitalized comments are the "labels" described at the link above.
403         reconstruct_active_formatting_elements = ->
404                 return if afe.length is 0
405                 if afe[0].type is TYPE_MARKER or afe[0] in open_els
406                         return
407                 # Rewind
408                 i = 0
409                 loop
410                         if i is afe.length - 1
411                                 break
412                         i += 1
413                         if afe[i].type is TYPE_MARKER or afe[i] in open_els
414                                 i -= 1 # Advance
415                                 break
416                 # Create
417                 loop
418                         el = afe[i].shallow_clone()
419                         tree_insert_element el
420                         afe[i] = el
421                         break if i is 0
422                         i -= 1
423
424         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
425         # adoption agency algorithm
426         # overview here:
427         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
428         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
429         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
430         adoption_agency = (subject) ->
431                 if open_els[0].name is subject
432                         el = open_els[0]
433                         open_els.shift()
434                         # remove it from the list of active formatting elements (if found)
435                         for t, i in afe
436                                 if t is el
437                                         afe.splice i, 1
438                                         break
439                         return
440                 outer = 0
441                 loop
442                         if outer >= 8
443                                 return
444                         outer += 1
445                         # 5. Let formatting element be the last element in the list of
446                         # active formatting elements that: is between the end of the list
447                         # and the last scope marker in the list, if any, or the start of
448                         # the list otherwise, and  has the tag name subject.
449                         fe = null
450                         for t, fe_of_afe in afe
451                                 if t.type is TYPE_MARKER
452                                         break
453                                 if t.name is subject
454                                         fe = t
455                                         break
456                         # If there is no such element, then abort these steps and instead
457                         # act as described in the "any other end tag" entry above.
458                         if fe is null
459                                 in_body_any_other_end_tag subject
460                                 return
461                         # 6. If formatting element is not in the stack of open elements,
462                         # then this is a parse error; remove the element from the list, and
463                         # abort these steps.
464                         in_open_els = false
465                         for t, fe_of_open_els in open_els
466                                 if t is fe
467                                         in_open_els = true
468                                         break
469                         unless in_open_els
470                                 parse_error()
471                                 # "remove it from the list" must mean afe, since it's not in open_els
472                                 afe.splice fe_of_afe, 1
473                                 return
474                         # 7. If formatting element is in the stack of open elements, but
475                         # the element is not in scope, then this is a parse error; abort
476                         # these steps.
477                         unless el_is_in_scope fe
478                                 parse_error()
479                                 return
480                         # 8. If formatting element is not the current node, this is a parse
481                         # error. (But do not abort these steps.)
482                         unless open_els[0] is fe
483                                 parse_error()
484                                 # continue
485                         # 9. Let furthest block be the topmost node in the stack of open
486                         # elements that is lower in the stack than formatting element, and
487                         # is an element in the special category. There might not be one.
488                         fb = null
489                         fb_of_open_els = null
490                         for t, i in open_els
491                                 if t is fe
492                                         break
493                                 if el_is_special t
494                                         fb = t
495                                         fb_of_open_els = i
496                                         # and continue, to see if there's one that's more "topmost"
497                         # 10. If there is no furthest block, then the UA must first pop all
498                         # the nodes from the bottom of the stack of open elements, from the
499                         # current node up to and including formatting element, then remove
500                         # formatting element from the list of active formatting elements,
501                         # and finally abort these steps.
502                         if fb is null
503                                 loop
504                                         t = open_els.shift()
505                                         if t is fe
506                                                 afe.splice fe_of_afe, 1
507                                                 return
508                         # 11. Let common ancestor be the element immediately above
509                         # formatting element in the stack of open elements.
510                         ca = open_els[fe_of_open_els + 1] # common ancestor
511
512                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
513                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
514                         bookmark = new_aaa_bookmark()
515                         for t, i in afe
516                                 if t is fe
517                                         afe.splice i, 0, bookmark
518                                         break
519                         node = last_node = fb
520                         inner = 0
521                         loop
522                                 inner += 1
523                                 # 3. Let node be the element immediately above node in the
524                                 # stack of open elements, or if node is no longer in the stack
525                                 # of open elements (e.g. because it got removed by this
526                                 # algorithm), the element that was immediately above node in
527                                 # the stack of open elements before node was removed.
528                                 node_next = null
529                                 for t, i in open_els
530                                         if t is node
531                                                 node_next = open_els[i + 1]
532                                                 break
533                                 node = node_next ? node_above
534                                 debug_log "inner loop #{inner}"
535                                 debug_log "open_els: #{serialize_els open_els, true, true}"
536                                 debug_log "tree: #{serialize_els tree.children, false, true}"
537                                 debug_log "afe: #{serialize_els afe, true, true}"
538                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
539                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
540                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
541                                 debug_log "node: #{node.serialize true, true}"
542                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
543
544                                 # 4. If node is formatting element, then go to the next step in
545                                 # the overall algorithm.
546                                 if node is fe
547                                         break
548                                 debug_log "the meat"
549                                 # 5. If inner loop counter is greater than three and node is in
550                                 # the list of active formatting elements, then remove node from
551                                 # the list of active formatting elements.
552                                 node_in_afe = false
553                                 for t, i in afe
554                                         if t is node
555                                                 if inner > 3
556                                                         afe.splice i, 1
557                                                         debug_log "max out inner"
558                                                 else
559                                                         node_in_afe = true
560                                                         debug_log "in afe"
561                                                 break
562                                 # 6. If node is not in the list of active formatting elements,
563                                 # then remove node from the stack of open elements and then go
564                                 # back to the step labeled inner loop.
565                                 unless node_in_afe
566                                         debug_log "not in afe"
567                                         for t, i in open_els
568                                                 if t is node
569                                                         node_above = open_els[i + 1]
570                                                         open_els.splice i, 1
571                                                         break
572                                         continue
573                                 debug_log "the bones"
574                                 # 7. create an element for the token for which the element node
575                                 # was created, in the HTML namespace, with common ancestor as
576                                 # the intended parent; replace the entry for node in the list
577                                 # of active formatting elements with an entry for the new
578                                 # element, replace the entry for node in the stack of open
579                                 # elements with an entry for the new element, and let node be
580                                 # the new element.
581                                 new_node = node.shallow_clone()
582                                 for t, i in afe
583                                         if t is node
584                                                 afe[i] = new_node
585                                                 debug_log "replaced in afe"
586                                                 break
587                                 for t, i in open_els
588                                         if t is node
589                                                 node_above = open_els[i + 1]
590                                                 open_els[i] = new_node
591                                                 debug_log "replaced in open_els"
592                                                 break
593                                 node = new_node
594                                 # 8. If last node is furthest block, then move the
595                                 # aforementioned bookmark to be immediately after the new node
596                                 # in the list of active formatting elements.
597                                 if last_node is fb
598                                         for t, i in afe
599                                                 if t is bookmark
600                                                         afe.splice i, 1
601                                                         debug_log "removed bookmark"
602                                                         break
603                                         for t, i in afe
604                                                 if t is node
605                                                         # "after" means lower
606                                                         afe.splice i, 0, bookmark # "after as <-
607                                                         debug_log "placed bookmark after node"
608                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
609                                                         break
610                                 # 9. Insert last node into node, first removing it from its
611                                 # previous parent node if any.
612                                 if last_node.parent?
613                                         debug_log "last_node has parent"
614                                         for c, i in last_node.parent.children
615                                                 if c is last_node
616                                                         debug_log "removing last_node from parent"
617                                                         last_node.parent.children.splice i, 1
618                                                         break
619                                 node.children.push last_node
620                                 last_node.parent = node
621                                 # 10. Let last node be node.
622                                 last_node = node
623                                 debug_log "at last"
624                                 # 11. Return to the step labeled inner loop.
625                         # 14. Insert whatever last node ended up being in the previous step
626                         # at the appropriate place for inserting a node, but using common
627                         # ancestor as the override target.
628
629                         # JASON: In the case where fe is immediately followed by fb:
630                         #   * inner loop exits out early (node==fe)
631                         #   * last_node is fb
632                         #   * last_node is still in the tree (not a duplicate)
633                         if last_node.parent?
634                                 debug_log "FEFIRST? last_node has parent"
635                                 for c, i in last_node.parent.children
636                                         if c is last_node
637                                                 debug_log "removing last_node from parent"
638                                                 last_node.parent.children.splice i, 1
639                                                 break
640
641                         debug_log "after aaa inner loop"
642                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
643                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
644                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
645                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
646                         debug_log "tree: #{serialize_els tree.children, false, true}"
647
648                         debug_log "insert"
649
650
651                         # can't use standard insert token thing, because it's already in
652                         # open_els and must stay at it's current position in open_els
653                         dest = adjusted_insertion_location ca
654                         dest[0].children.splice dest[1], 0, last_node
655                         last_node.parent = dest[0]
656
657
658                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
659                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
660                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
661                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
662                         debug_log "tree: #{serialize_els tree.children, false, true}"
663
664                         # 15. Create an element for the token for which formatting element
665                         # was created, in the HTML namespace, with furthest block as the
666                         # intended parent.
667                         new_element = fe.shallow_clone() # FIXME intended parent thing
668                         # 16. Take all of the child nodes of furthest block and append them
669                         # to the element created in the last step.
670                         while fb.children.length
671                                 t = fb.children.shift()
672                                 t.parent = new_element
673                                 new_element.children.push t
674                         # 17. Append that new element to furthest block.
675                         new_element.parent = fb
676                         fb.children.push new_element
677                         # 18. Remove formatting element from the list of active formatting
678                         # elements, and insert the new element into the list of active
679                         # formatting elements at the position of the aforementioned
680                         # bookmark.
681                         for t, i in afe
682                                 if t is fe
683                                         afe.splice i, 1
684                                         break
685                         for t, i in afe
686                                 if t is bookmark
687                                         afe[i] = new_element
688                                         break
689                         # 19. Remove formatting element from the stack of open elements,
690                         # and insert the new element into the stack of open elements
691                         # immediately below the position of furthest block in that stack.
692                         for t, i in open_els
693                                 if t is fe
694                                         open_els.splice i, 1
695                                         break
696                         for t, i in open_els
697                                 if t is fb
698                                         open_els.splice i, 0, new_element
699                                         break
700                         # 20. Jump back to the step labeled outer loop.
701                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
702                         debug_log "tree: #{serialize_els tree.children, false, true}"
703                         debug_log "open_els: #{serialize_els open_els, true, true}"
704                         debug_log "afe: #{serialize_els afe, true, true}"
705                 debug_log "AAA DONE"
706
707         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
708         # FIXME test this (particularly emplied end tags)
709         close_p_element = ->
710                 generate_implied_end_tags 'p' # arg is exception
711                 if open_els[0].name isnt 'p'
712                         parse_error()
713                 while open_els.length > 1 # just in case
714                         t = open_els.shift()
715                         if t.name is 'p'
716                                 return
717         close_p_if_in_button_scope = ->
718                 if is_in_button_scope 'p'
719                         close_a_p_element()
720
721         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
722         tree_insert_text = (t) ->
723                 dest = adjusted_insertion_location()
724                 if dest[1] > 0
725                         prev = dest[0].children[dest[1] - 1]
726                         if prev.type is TYPE_TEXT
727                                 prev.text += t.text
728                                 return
729                 dest[0].children.splice dest[1], 0, t
730
731         # 8.2.5.1
732         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
733         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
734         adjusted_insertion_location = (override_target = null) ->
735                 # 1. If there was an override target specified, then let target be the
736                 # override target.
737                 if override_target?
738                         target = override_target
739                 else # Otherwise, let target be the current node.
740                         target = open_els[0]
741                 # 2. Determine the adjusted insertion location using the first matching
742                 # steps from the following list:
743                 #
744                 # If foster parenting is enabled and target is a table, tbody, tfoot,
745                 # thead, or tr element Foster parenting happens when content is
746                 # misnested in tables.
747                 if flag_foster_parenting and target.name in foster_parenting_targets
748                         console.log "foster parenting isn't implemented yet" # TODO
749                         # 1. Let last template be the last template element in the stack of
750                         # open elements, if any.
751                         # 2. Let last table be the last table element in the stack of open
752                         # elements, if any.
753
754                         # 3. If there is a last template and either there is no last table,
755                         # or there is one, but last template is lower (more recently added)
756                         # than last table in the stack of open elements, then: let adjusted
757                         # insertion location be inside last template's template contents,
758                         # after its last child (if any), and abort these substeps.
759
760                         # 4. If there is no last table, then let adjusted insertion
761                         # location be inside the first element in the stack of open
762                         # elements (the html element), after its last child (if any), and
763                         # abort these substeps. (fragment case)
764
765                         # 5. If last table has a parent element, then let adjusted
766                         # insertion location be inside last table's parent element,
767                         # immediately before last table, and abort these substeps.
768
769                         # 6. Let previous element be the element immediately above last
770                         # table in the stack of open elements.
771
772                         # 7. Let adjusted insertion location be inside previous element,
773                         # after its last child (if any).
774
775                         # Note: These steps are involved in part because it's possible for
776                         # elements, the table element in this case in particular, to have
777                         # been moved by a script around in the DOM, or indeed removed from
778                         # the DOM entirely, after the element was inserted by the parser.
779                 else
780                         # Otherwise Let adjusted insertion location be inside target, after
781                         # its last child (if any).
782                         target_i = target.children.length
783
784                 # 3. If the adjusted insertion location is inside a template element,
785                 # let it instead be inside the template element's template contents,
786                 # after its last child (if any). TODO
787
788                 # 4. Return the adjusted insertion location.
789                 return [target, target_i]
790
791         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
792         # aka create_an_element_for_token
793         token_to_element = (t, namespace, intended_parent) ->
794                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
795                 # convert attributes into a hash
796                 attrs = {}
797                 while t.attrs_a.length
798                         a = t.attrs_a.pop()
799                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
800                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
801
802                 # TODO 2. If the newly created element has an xmlns attribute in the
803                 # XMLNS namespace whose value is not exactly the same as the element's
804                 # namespace, that is a parse error. Similarly, if the newly created
805                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
806                 # value is not the XLink Namespace, that is a parse error.
807
808                 # fixfull: the spec says stuff about form pointers and ownerDocument
809
810                 return el
811
812         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
813         insert_foreign_element = (token, namespace) ->
814                 ail = adjusted_insertion_location()
815                 ail_el = ail[0]
816                 ail_i = ail[1]
817                 el = token_to_element token, namespace, ail_el
818                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
819                 el.parent = ail_el
820                 ail_el.children.splice ail_i, 0, el
821                 open_els.unshift el
822                 return el
823         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
824         insert_html_element = insert_foreign_element # (token, namespace) ->
825
826         # FIXME read implement "foster parenting" part
827         # FIXME read spec, do this right
828         # FIXME implement the override target thing
829         # note: this assumes it's an open tag
830         # FIXME what part of the spec is this?
831         # TODO look through all callers of this, and see what they should really be doing.
832         #   eg probably insert_html_element for tokens
833         tree_insert_element = (el, override_target = null, namespace = null) ->
834                 if namespace?
835                         el.namespace = namespace
836                 dest = adjusted_insertion_location override_target
837                 if el.type is TYPE_OPEN_TAG # means it's a "token"
838                         el = token_to_element el, namespace, dest[0]
839                 unless el.namespace?
840                         namespace = dest.namespace
841                 # fixfull: Document nodes sometimes can't accept more chidren
842                 dest[0].children.splice dest[1], 0, el
843                 el.parent = dest[0]
844                 open_els.unshift el
845                 return el
846
847         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
848         tree_insert_a_comment = (t) ->
849                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
850                 open_els[0].children.push t
851
852         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
853         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
854         generate_implied_end_tags = (except = null) ->
855                 while end_tag_implied[open_els[0]] and open_els[0].name isnt except
856                         open_els.shift()
857
858         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
859         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
860                 for node, i in open_els
861                         if node.name is name # FIXME check namespace too
862                                 generate_implied_end_tags name # arg is exception
863                                 parse_error() unless i is 0
864                                 while i >= 0
865                                         open_els.shift()
866                                         i -= 1
867                                 return
868                         if special_elements[node.name]? # FIXME check namespac too
869                                 parse_error()
870                                 return
871         tree_in_body = (t) ->
872                 switch t.type
873                         when TYPE_TEXT
874                                 switch t.text
875                                         when "\u0000"
876                                                 parse_error()
877                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
878                                                 reconstruct_active_formatting_elements()
879                                                 tree_insert_text t
880                                         else
881                                                 reconstruct_active_formatting_elements()
882                                                 tree_insert_text t
883                                                 flag_frameset_ok = false
884                         when TYPE_COMMENT
885                                 tree_insert_a_comment t
886                         when TYPE_DOCTYPE
887                                 parse_error()
888                         when TYPE_OPEN_TAG
889                                 switch t.name
890                                         when 'html'
891                                                 parse_error()
892                                                 return if template_tag_is_open()
893                                                 root_attrs = open_els[open_els.length - 1].attrs
894                                                 for k, v of t.attrs
895                                                         root_attrs[k] = v unless root_attrs[k]?
896                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
897                                                 # FIXME also do this for </template> (end tag)
898                                                 return tree_in_head t
899                                         when 'body'
900                                                 parse_error()
901                                                 # TODO
902                                         when 'frameset'
903                                                 parse_error()
904                                                 # TODO
905                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
906                                                 close_p_if_in_button_scope()
907                                                 insert_html_element t
908                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
909                                                 close_p_if_in_button_scope()
910                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
911                                                         parse_error()
912                                                         open_els.shift()
913                                                 insert_html_element t
914                                         # TODO lots more to implement here
915                                         when 'a'
916                                                 # If the list of active formatting elements
917                                                 # contains an a element between the end of the list and
918                                                 # the last marker on the list (or the start of the list
919                                                 # if there is no marker on the list), then this is a
920                                                 # parse error; run the adoption agency algorithm for
921                                                 # the tag name "a", then remove that element from the
922                                                 # list of active formatting elements and the stack of
923                                                 # open elements if the adoption agency algorithm didn't
924                                                 # already remove it (it might not have if the element
925                                                 # is not in table scope).
926                                                 found = false
927                                                 for el in afe
928                                                         if el.type is TYPE_MARKER
929                                                                 break
930                                                         if el.name is 'a'
931                                                                 found = el
932                                                 if found?
933                                                         parse_error()
934                                                         adoption_agency 'a'
935                                                         for el, i in afe
936                                                                 if el is found
937                                                                         afe.splice i, 1
938                                                         for el, i in open_els
939                                                                 if el is found
940                                                                         open_els.splice i, 1
941                                                 reconstruct_active_formatting_elements()
942                                                 el = tree_insert_element t
943                                                 afe.unshift el
944                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
945                                                 reconstruct_active_formatting_elements()
946                                                 el = tree_insert_element t
947                                                 afe.unshift el
948                                         # TODO lots more to implement here
949                                         else # any other start tag
950                                                 reconstruct_active_formatting_elements()
951                                                 tree_insert_element t
952                         when TYPE_EOF
953                                 ok_tags = {
954                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
955                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
956                                 }
957                                 for t in open_els
958                                         unless ok_tags[t.name]?
959                                                 parse_error()
960                                                 break
961                                 # TODO stack of template insertion modes thing
962                                 flag_parsing = false # stop parsing
963                         when TYPE_END_TAG
964                                 switch t.name
965                                         when 'body'
966                                                 unless is_in_scope 'body'
967                                                         parse_error()
968                                                         return
969                                                 # TODO implement parse error and move to tree_after_body
970                                         when 'html'
971                                                 unless is_in_scope 'body' # weird, but it's what the spec says
972                                                         parse_error()
973                                                         return
974                                                 # TODO implement parse error and move to tree_after_body, reprocess
975                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
976                                                 unless is_in_scope t.name, NS_HTML
977                                                         parse_error()
978                                                         return
979                                                 generate_implied_end_tags()
980                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
981                                                         parse_error()
982                                                 loop
983                                                         el = open_els.shift()
984                                                         if el.name is t.name and el.namespace is NS_HTML
985                                                                 return
986                                         # TODO lots more close tags to implement here
987                                         when 'p'
988                                                 unless is_in_button_scope 'p'
989                                                         parse_error()
990                                                         insert_html_element new_open_tag 'p'
991                                                         close_p_element()
992                                         # TODO lots more close tags to implement here
993                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
994                                                 adoption_agency t.name
995                                         # TODO lots more close tags to implement here
996                                         else
997                                                 in_body_any_other_end_tag t.name
998                 return
999
1000
1001         # the functions below implement the tokenizer stats described here:
1002         # http://www.w3.org/TR/html5/syntax.html#tokenization
1003
1004         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1005         tok_state_data = ->
1006                 switch c = txt.charAt(cur++)
1007                         when '&'
1008                                 return new_text_node tokenize_character_reference()
1009                         when '<'
1010                                 tok_state = tok_state_tag_open
1011                         when "\u0000"
1012                                 parse_error()
1013                                 return new_text_node c
1014                         when '' # EOF
1015                                 return new_eof_token()
1016                         else
1017                                 return new_text_node c
1018                 return null
1019
1020         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1021         # not needed: tok_state_character_reference_in_data = ->
1022         # just call tok_state_character_reference_in_data()
1023
1024         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1025         tok_state_tag_open = ->
1026                 switch c = txt.charAt(cur++)
1027                         when '!'
1028                                 tok_state = tok_state_markup_declaration_open
1029                         when '/'
1030                                 tok_state = tok_state_end_tag_open
1031                         when '?'
1032                                 parse_error()
1033                                 tok_state = tok_state_bogus_comment
1034                         else
1035                                 if lc_alpha.indexOf(c) > -1
1036                                         tok_cur_tag = new_open_tag c
1037                                         tok_state = tok_state_tag_name
1038                                 else if uc_alpha.indexOf(c) > -1
1039                                         tok_cur_tag = new_open_tag c.toLowerCase()
1040                                         tok_state = tok_state_tag_name
1041                                 else
1042                                         parse_error()
1043                                         tok_state = tok_state_data
1044                                         cur -= 1 # we didn't parse/handle the char after <
1045                                         return new_text_node '<'
1046                 return null
1047
1048         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1049         tok_state_end_tag_open = ->
1050                 switch c = txt.charAt(cur++)
1051                         when '>'
1052                                 parse_error()
1053                                 tok_state = tok_state_data
1054                         when '' # EOF
1055                                 parse_error()
1056                                 tok_state = tok_state_data
1057                                 return new_text_node '</'
1058                         else
1059                                 if uc_alpha.indexOf(c) > -1
1060                                         tok_cur_tag = new_end_tag c.toLowerCase()
1061                                         tok_state = tok_state_tag_name
1062                                 else if lc_alpha.indexOf(c) > -1
1063                                         tok_cur_tag = new_end_tag c
1064                                         tok_state = tok_state_tag_name
1065                                 else
1066                                         parse_error()
1067                                         tok_state = tok_state_bogus_comment
1068                 return null
1069
1070         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1071         tok_state_tag_name = ->
1072                 switch c = txt.charAt(cur++)
1073                         when "\t", "\n", "\u000c", ' '
1074                                 tok_state = tok_state_before_attribute_name
1075                         when '/'
1076                                 tok_state = tok_state_self_closing_start_tag
1077                         when '>'
1078                                 tok_state = tok_state_data
1079                                 tmp = tok_cur_tag
1080                                 tok_cur_tag = null
1081                                 return tmp
1082                         when "\u0000"
1083                                 parse_error()
1084                                 tok_cur_tag.name += "\ufffd"
1085                         when '' # EOF
1086                                 parse_error()
1087                                 tok_state = tok_state_data
1088                         else
1089                                 if uc_alpha.indexOf(c) > -1
1090                                         tok_cur_tag.name += c.toLowerCase()
1091                                 else
1092                                         tok_cur_tag.name += c
1093                 return null
1094
1095         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1096         tok_state_before_attribute_name = ->
1097                 attr_name = null
1098                 switch c = txt.charAt(cur++)
1099                         when "\t", "\n", "\u000c", ' '
1100                                 return null
1101                         when '/'
1102                                 tok_state = tok_state_self_closing_start_tag
1103                                 return null
1104                         when '>'
1105                                 tok_state = tok_state_data
1106                                 tmp = tok_cur_tag
1107                                 tok_cur_tag = null
1108                                 return tmp
1109                         when "\u0000"
1110                                 parse_error()
1111                                 attr_name = "\ufffd"
1112                         when '"', "'", '<', '='
1113                                 parse_error()
1114                                 attr_name = c
1115                         when '' # EOF
1116                                 parse_error()
1117                                 tok_state = tok_state_data
1118                         else
1119                                 if uc_alpha.indexOf(c) > -1
1120                                         attr_name = c.toLowerCase()
1121                                 else
1122                                         attr_name = c
1123                 if attr_name?
1124                         tok_cur_tag.attrs_a.unshift [attr_name, '']
1125                         tok_state = tok_state_attribute_name
1126                 return null
1127
1128         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1129         tok_state_attribute_name = ->
1130                 switch c = txt.charAt(cur++)
1131                         when "\t", "\n", "\u000c", ' '
1132                                 tok_state = tok_state_after_attribute_name
1133                         when '/'
1134                                 tok_state = tok_state_self_closing_start_tag
1135                         when '='
1136                                 tok_state = tok_state_before_attribute_value
1137                         when '>'
1138                                 tok_state = tok_state_data
1139                                 tmp = tok_cur_tag
1140                                 tok_cur_tag = null
1141                                 return tmp
1142                         when "\u0000"
1143                                 parse_error()
1144                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1145                         when '"', "'", '<'
1146                                 parse_error()
1147                                 tok_cur_tag.attrs_a[0][0] = c
1148                         when '' # EOF
1149                                 parse_error()
1150                                 tok_state = tok_state_data
1151                         else
1152                                 if uc_alpha.indexOf(c) > -1
1153                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1154                                 else
1155                                         tok_cur_tag.attrs_a[0][0] += c
1156                 return null
1157
1158         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1159         tok_state_before_attribute_value = ->
1160                 switch c = txt.charAt(cur++)
1161                         when "\t", "\n", "\u000c", ' '
1162                                 return null
1163                         when '"'
1164                                 tok_state = tok_state_attribute_value_double_quoted
1165                         when '&'
1166                                 tok_state = tok_state_attribute_value_unquoted
1167                                 cur -= 1
1168                         when "'"
1169                                 tok_state = tok_state_attribute_value_single_quoted
1170                         when "\u0000"
1171                                 # Parse error
1172                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1173                                 tok_state = tok_state_attribute_value_unquoted
1174                         when '>'
1175                                 # Parse error
1176                                 tok_state = tok_state_data
1177                                 tmp = tok_cur_tag
1178                                 tok_cur_tag = null
1179                                 return tmp
1180                         when '' # EOF
1181                                 parse_error()
1182                                 tok_state = tok_state_data
1183                         else
1184                                 tok_cur_tag.attrs_a[0][1] += c
1185                                 tok_state = tok_state_attribute_value_unquoted
1186                 return null
1187
1188         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1189         tok_state_attribute_value_double_quoted = ->
1190                 switch c = txt.charAt(cur++)
1191                         when '"'
1192                                 tok_state = tok_state_after_attribute_value_quoted
1193                         when '&'
1194                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1195                         when "\u0000"
1196                                 # Parse error
1197                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1198                         when '' # EOF
1199                                 parse_error()
1200                                 tok_state = tok_state_data
1201                         else
1202                                 tok_cur_tag.attrs_a[0][1] += c
1203                 return null
1204
1205         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1206         tok_state_attribute_value_single_quoted = ->
1207                 switch c = txt.charAt(cur++)
1208                         when "'"
1209                                 tok_state = tok_state_after_attribute_value_quoted
1210                         when '&'
1211                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1212                         when "\u0000"
1213                                 # Parse error
1214                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1215                         when '' # EOF
1216                                 parse_error()
1217                                 tok_state = tok_state_data
1218                         else
1219                                 tok_cur_tag.attrs_a[0][1] += c
1220                 return null
1221
1222         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1223         tok_state_attribute_value_unquoted = ->
1224                 switch c = txt.charAt(cur++)
1225                         when "\t", "\n", "\u000c", ' '
1226                                 tok_state = tok_state_before_attribute_name
1227                         when '&'
1228                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1229                         when '>'
1230                                 tok_state = tok_state_data
1231                                 tmp = tok_cur_tag
1232                                 tok_cur_tag = null
1233                                 return tmp
1234                         when "\u0000"
1235                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1236                         when '' # EOF
1237                                 parse_error()
1238                                 tok_state = tok_state_data
1239                         else
1240                                 # Parse Error if ', <, = or ` (backtick)
1241                                 tok_cur_tag.attrs_a[0][1] += c
1242                 return null
1243
1244         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1245         tok_state_after_attribute_value_quoted = ->
1246                 switch c = txt.charAt(cur++)
1247                         when "\t", "\n", "\u000c", ' '
1248                                 tok_state = tok_state_before_attribute_name
1249                         when '/'
1250                                 tok_state = tok_state_self_closing_start_tag
1251                         when '>'
1252                                 tok_state = tok_state_data
1253                                 tmp = tok_cur_tag
1254                                 tok_cur_tag = null
1255                                 return tmp
1256                         when '' # EOF
1257                                 parse_error()
1258                                 tok_state = tok_state_data
1259                         else
1260                                 # Parse Error
1261                                 tok_state = tok_state_before_attribute_name
1262                                 cur -= 1 # we didn't handle that char
1263                 return null
1264
1265         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1266         # Don't set this as a state, just call it
1267         # returns a string (NOT a text node)
1268         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1269                 if cur >= txt.length
1270                         return '&'
1271                 switch c = txt.charAt(cur)
1272                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1273                                 # explicitly not a parse error
1274                                 return '&'
1275                         when ';'
1276                                 # there has to be "one or more" alnums between & and ; to be a parse error
1277                                 return '&'
1278                         when '#'
1279                                 if cur + 1 >= txt.length
1280                                         return '&'
1281                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
1282                                         prefix = '#x'
1283                                         charset = hex_chars
1284                                         start = cur + 2
1285                                 else
1286                                         charset = digits
1287                                         start = cur + 1
1288                                         prefix = '#'
1289                                 i = 0
1290                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1291                                         i += 1
1292                                 if i is 0
1293                                         return '&'
1294                                 if txt.charAt(start + i) is ';'
1295                                         i += 1
1296                                 # FIXME This is supposed to generate parse errors for some chars
1297                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1298                                 if decoded?
1299                                         cur = start + i
1300                                         return decoded
1301                                 return '&'
1302                         else
1303                                 for i in [0...31]
1304                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
1305                                                 break
1306                                 if i is 0
1307                                         # exit early, because parse_error() below needs at least one alnum
1308                                         return '&'
1309                                 if txt.charAt(cur + i) is ';'
1310                                         i += 1 # include ';' terminator in value
1311                                         decoded = decode_named_char_ref txt.substr(cur, i)
1312                                         if decoded?
1313                                                 cur += i
1314                                                 return decoded
1315                                         parse_error()
1316                                         return '&'
1317                                 else
1318                                         # no ';' terminator (only legacy char refs)
1319                                         max = i
1320                                         for i in [2..max] # no prefix matches, so ok to check shortest first
1321                                                 c = legacy_char_refs[txt.substr(cur, i)]
1322                                                 if c?
1323                                                         if in_attr
1324                                                                 if txt.charAt(cur + i) is '='
1325                                                                         # "because some legacy user agents will
1326                                                                         # misinterpret the markup in those cases"
1327                                                                         parse_error()
1328                                                                         return '&'
1329                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
1330                                                                         # this makes attributes forgiving about url args
1331                                                                         return '&'
1332                                                         # ok, and besides the weird exceptions for attributes...
1333                                                         # return the matching char
1334                                                         cur += i # consume entity chars
1335                                                         parse_error() # because no terminating ";"
1336                                                         return c
1337                                         parse_error()
1338                                         return '&'
1339                 return # never reached
1340
1341         # tree constructor initialization
1342         # see comments on TYPE_TAG/etc for the structure of this data
1343         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1344         open_els = [tree]
1345         tree_state = tree_in_body
1346         flag_frameset_ok = true
1347         flag_parsing = true
1348         flag_foster_parenting = false
1349         afe = [] # active formatting elements
1350
1351         # tokenizer initialization
1352         tok_state = tok_state_data
1353
1354         # proccess input
1355         while flag_parsing
1356                 t = tok_state()
1357                 if t?
1358                         tree_state t
1359         return tree.children
1360
1361 # everything below is tests on the above
1362 test_equals = (description, output, expected_output) ->
1363         if output is expected_output
1364                 console.log "passed." # don't say name, so smart consoles can merge all of these
1365         else
1366                 console.log "FAILED: \"#{description}\""
1367                 console.log "   Expected: #{expected_output}"
1368                 console.log "     Actual: #{output}"
1369 serialize_els = (els, shallow, show_ids) ->
1370         serialized = ''
1371         sep = ''
1372         for t in els
1373                 serialized += sep
1374                 sep = ','
1375                 serialized += t.serialize shallow, show_ids
1376         return serialized
1377 test_parser = (args) ->
1378         debug_log_reset()
1379         parse_errors = []
1380         errors_cb = (i) ->
1381                 parse_errors.push i
1382         prev_node_id = 0 # reset counter
1383         parsed = parse_html args.html, errors_cb
1384         serialized = serialize_els parsed, false, false
1385         if serialized isnt args.expected # or parse_errors.length isnt args.errors
1386                 debug_log_each (str) ->
1387                         console.log str
1388                 console.log "FAILED: \"#{args.name}\""
1389         else
1390                 console.log "passed \"#{args.name}\""
1391         if serialized isnt args.expected
1392                 console.log "      Input: #{args.html}"
1393                 console.log "    Correct: #{args.expected}"
1394                 console.log "     Output: #{serialized}"
1395                 if parse_errors.length isnt args.errors
1396                         console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1397
1398 test_parser name: "empty", \
1399         html: "",
1400         expected: '',
1401         errors: 0
1402 test_parser name: "just text", \
1403         html: "abc",
1404         expected: 'text:"abc"',
1405         errors: 0
1406 test_parser name: "named entity", \
1407         html: "a&amp;1234",
1408         expected: 'text:"a&1234"',
1409         errors: 0
1410 test_parser name: "broken named character references", \
1411         html: "1&amp2&&amp;3&aabbcc;",
1412         expected: 'text:"1&2&&3&aabbcc;"',
1413         errors: 2
1414 test_parser name: "numbered entity overrides", \
1415         html: "1&#X80&#x80; &#x83",
1416         expected: 'text:"1€€ ƒ"',
1417         errors: 0
1418 test_parser name: "open tag", \
1419         html: "foo<span>bar",
1420         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
1421         errors: 1 # no close tag
1422 test_parser name: "open tag with attributes", \
1423         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1424         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
1425         errors: 1 # no close tag
1426 test_parser name: "open tag with attributes of various quotings", \
1427         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1428         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
1429         errors: 1 # no close tag
1430 test_parser name: "attribute entity exceptions dq", \
1431         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
1432         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1433         errors: 2 # no close tag, &amp= in attr
1434 test_parser name: "attribute entity exceptions sq", \
1435         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
1436         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1437         errors: 2 # no close tag, &amp= in attr
1438 test_parser name: "attribute entity exceptions uq", \
1439         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
1440         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1441         errors: 2 # no close tag, &amp= in attr
1442 test_parser name: "matching closing tags", \
1443         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1444         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
1445         errors: 0
1446 test_parser name: "missing closing tag inside", \
1447         html: "foo<div>bar<span>baz</div>qux",
1448         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
1449         errors: 1 # close tag mismatch
1450 test_parser name: "mis-matched closing tags", \
1451         html: "<span>12<div>34</span>56</div>78",
1452         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
1453         errors: 2 # misplaced </span>, no </span> at the end
1454 test_parser name: "mis-matched formatting elements", \
1455         html: "12<b>34<i>56</b>78</i>90",
1456         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
1457         errors: 1 # no idea how many their should be
1458 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1459         html: '<p>1<b>2<i>3</b>4</i>5</p>',
1460         expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
1461         errors: 1
1462 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1463         html: '<b>1<p>2</b>3</p>',
1464         expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
1465         errors: 1
1466 test_parser name: "crazy formatting elements test", \
1467         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1468         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1469         # firefox does this:
1470         expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
1471         errors: 6 # no idea how many there should be
1472 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1473 test_parser name: "html5lib aaa 1", \
1474         html: '<a><p></a></p>',
1475         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
1476         errors: 2
1477 test_parser name: "html5lib aaa 2", \
1478         html: '<a>1<p>2</a>3</p>',
1479         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
1480         errors: 2
1481 test_parser name: "html5lib aaa 3", \
1482         html: '<a>1<button>2</a>3</button>',
1483         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
1484         errors: 2
1485 test_parser name: "html5lib aaa 4", \
1486         html: '<a>1<b>2</a>3</b>',
1487         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
1488         errors: 2
1489 test_parser name: "html5lib aaa 5", \
1490         html: '<a>1<div>2<div>3</a>4</div>5</div>',
1491         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
1492         errors: 3