JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse end tags, close tags with proper nesting
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
28
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
31 TYPE_WHITESPACE = 2
32 TYPE_COMMENT = 3
33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_CLOSE_TAG = 5 # name
36 TYPE_EOF = 6
37
38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
40 digits = "0123456789"
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
43
44 # some SVG elements have dashes in them
45 tag_name_chars = alnum + "-"
46
47 # http://www.w3.org/TR/html5/infrastructure.html#space-character
48 space_chars = "\u0009\u000a\u000c\u000d\u0020"
49
50 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
51 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
52
53 # These are the character references that don't need a terminating semicolon
54 # min length: 2, max: 6, none are a prefix of any other.
55 legacy_char_refs = {
56         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
57         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
58         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
59         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
60         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
61         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
62         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
63         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
64         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
65         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
66         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
67         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
68         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
69         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
70         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
71         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
72         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
73         yen: '¥', yuml: 'ÿ'
74 }
75
76 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
77 raw_text_elements = ['script', 'style']
78 escapable_raw_text_elements = ['textarea', 'title']
79 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
80 svg_elements = [
81         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
82         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
83         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
84         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
85         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
86         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
87         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
88         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
89         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
90         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
91         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
92         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
93         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
94         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
95         'view', 'vkern'
96 ]
97
98 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
99 mathml_elements = [
100         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
101         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
102         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
103         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
104         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
105         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
106         'determinant', 'diff', 'divergence', 'divide', 'domain',
107         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
108         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
109         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
110         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
111         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
112         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
113         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
114         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
115         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
116         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
117         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
118         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
119         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
120         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
121         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
122         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
123         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
124         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
125         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
126         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
127         'vectorproduct', 'xor'
128 ]
129 # foreign_elements = [svg_elements..., mathml_elements...]
130 #normal_elements = All other allowed HTML elements are normal elements.
131
132
133 # decode_named_char_ref()
134 #
135 # The list of named character references is _huge_ so ask the browser to decode
136 # for us instead of wasting bandwidth/space on including the table here.
137 #
138 # Pass without the "&" but with the ";" examples:
139 #    for "&amp" pass "amp;"
140 #    for "&#x2032" pass "x2032;"
141 g_dncr = {
142         cache: {}
143         textarea: document.createElement('textarea')
144 }
145 # TODO test this in IE8
146 decode_named_char_ref = (txt) ->
147         txt = "&#{txt}"
148         decoded = g_dncr.cache[txt]
149         return decoded if decoded?
150         g_dncr.textarea.innerHTML = txt
151         decoded = g_dncr.textarea.value
152         return null if decoded is txt
153         return g_dncr.cache[txt] = decoded
154
155 parse_html = (txt) ->
156         cur = 0 # index of next char in txt to be parsed
157         # declare tree and tokenizer variables so they're in scope below
158         tree = null
159         open_tags = [] # stack of open elements
160         tree_state = null
161         tok_state = null
162         tok_cur_tag = null # partially parsed tag
163
164         parse_error = ->
165                 console.log "Parse error at character #{cur} of #{txt.length}"
166
167         # the functions below implement the tokenizer stats described here:
168         # http://www.w3.org/TR/html5/syntax.html#tokenization
169
170         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
171         tok_state_data = ->
172                 switch c = txt.charAt(cur++)
173                         when '&'
174                                 return [TYPE_TEXT, tokenize_character_reference()]
175                         when '<'
176                                 tok_state = tok_state_tag_open
177                         when "\u0000"
178                                 parse_error()
179                                 return [TYPE_TEXT, c]
180                         when '' # EOF
181                                 return [TYPE_EOF]
182                         else
183                                 return [TYPE_TEXT, c]
184                 return null
185
186         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
187         # not needed: tok_state_character_reference_in_data = ->
188         # just call tok_state_character_reference_in_data()
189
190         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
191         tok_state_tag_open = ->
192                 switch c = txt.charAt(cur++)
193                         when '!'
194                                 tok_state = tok_state_markup_declaration_open
195                         when '/'
196                                 tok_state = tok_state_end_tag_open
197                         when '?'
198                                 parse_error()
199                                 tok_state = tok_state_bogus_comment
200                         else
201                                 if lc_alpha.indexOf(c) > -1
202                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
203                                         tok_state = tok_state_tag_name
204                                 else if uc_alpha.indexOf(c) > -1
205                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
206                                         tok_state = tok_state_tag_name
207                                 else
208                                         parse_error()
209                                         tok_state = tok_state_data
210                                         cur -= 1 # we didn't parse/handle the char after <
211                                         return [TYPE_TEXT, '<']
212                 return null
213
214         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
215         tok_state_end_tag_open = ->
216                 switch c = txt.charAt(cur++)
217                         when '>'
218                                 parse_error()
219                                 tok_state = tok_state_data
220                         when '' # EOF
221                                 parse_error()
222                                 tok_state = tok_state_data
223                                 return [TYPE_TEXT, '</']
224                         else
225                                 if uc_alpha.indexOf(c) > -1
226                                         tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
227                                         tok_state = tok_state_tag_name
228                                 else if lc_alpha.indexOf(c) > -1
229                                         tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
230                                         tok_state = tok_state_tag_name
231                                 else
232                                         parse_error()
233                                         tok_state = tok_state_bogus_comment
234                 return null
235
236         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
237         tok_state_tag_name = ->
238                 switch c = txt.charAt(cur++)
239                         when "\t", "\n", "\u000c", ' '
240                                 tok_state = tok_state_before_attribute_name
241                         when '/'
242                                 tok_state = tok_state_self_closing_start_tag
243                         when '>'
244                                 tok_state = tok_state_data
245                                 tmp = tok_cur_tag
246                                 tok_cur_tag = null
247                                 return tmp
248                         when "\u0000"
249                                 parse_error()
250                                 tok_cur_tag[1] += "\ufffd"
251                         when '' # EOF
252                                 parse_error()
253                                 tok_state = tok_state_data
254                         else
255                                 if uc_alpha.indexOf(c) > -1
256                                         tok_cur_tag[1] += c.toLowerCase()
257                                 else
258                                         tok_cur_tag[1] += c
259                 return null
260
261         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
262         tok_state_before_attribute_name = ->
263                 attr_name = null
264                 switch c = txt.charAt(cur++)
265                         when "\t", "\n", "\u000c", ' '
266                                 return null
267                         when '/'
268                                 tok_state = tok_state_self_closing_start_tag
269                                 return null
270                         when '>'
271                                 tok_state = tok_state_data
272                                 tmp = tok_cur_tag
273                                 tok_cur_tag = null
274                                 return tmp
275                         when "\u0000"
276                                 parse_error()
277                                 attr_name = "\ufffd"
278                         when '"', "'", '<', '='
279                                 parse_error()
280                                 attr_name = c
281                         when '' # EOF
282                                 parse_error()
283                                 tok_state = tok_state_data
284                         else
285                                 if uc_alpha.indexOf(c) > -1
286                                         attr_name = c.toLowerCase()
287                                 else
288                                         attr_name = c
289                 if attr_name?
290                         tok_cur_tag[2].unshift [attr_name, '']
291                         tok_state = tok_state_attribute_name
292                 return null
293
294         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
295         tok_state_attribute_name = ->
296                 switch c = txt.charAt(cur++)
297                         when "\t", "\n", "\u000c", ' '
298                                 tok_state = tok_state_after_attribute_name
299                         when '/'
300                                 tok_state = tok_state_self_closing_start_tag
301                         when '='
302                                 tok_state = tok_state_before_attribute_value
303                         when '>'
304                                 tok_state = tok_state_data
305                                 tmp = tok_cur_tag
306                                 tok_cur_tag = null
307                                 return tmp
308                         when "\u0000"
309                                 parse_error()
310                                 tok_cur_tag[2][0][0] = "\ufffd"
311                         when '"', "'", '<'
312                                 parse_error()
313                                 tok_cur_tag[2][0][0] = c
314                         when '' # EOF
315                                 parse_error()
316                                 tok_state = tok_state_data
317                         else
318                                 if uc_alpha.indexOf(c) > -1
319                                         tok_cur_tag[2][0][0] = c.toLowerCase()
320                                 else
321                                         tok_cur_tag[2][0][0] += c
322                 return null
323
324         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
325         tok_state_before_attribute_value = ->
326                 switch c = txt.charAt(cur++)
327                         when "\t", "\n", "\u000c", ' '
328                                 return null
329                         when '"'
330                                 tok_state = tok_state_attribute_value_double_quoted
331                         when '&'
332                                 tok_state = tok_state_attribute_value_unquoted
333                                 cur -= 1
334                         when "'"
335                                 tok_state = tok_state_attribute_value_single_quoted
336                         when "\u0000"
337                                 # Parse error
338                                 tok_cur_tag[2][0][1] += "\ufffd"
339                                 tok_state = tok_state_attribute_value_unquoted
340                         when '>'
341                                 # Parse error
342                                 tok_state = tok_state_data
343                                 tmp = tok_cur_tag
344                                 tok_cur_tag = null
345                                 return tmp
346                         when '' # EOF
347                                 parse_error()
348                                 tok_state = tok_state_data
349                         else
350                                 tok_cur_tag[2][0][1] += c
351                                 tok_state = tok_state_attribute_value_unquoted
352                 return null
353
354         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
355         tok_state_attribute_value_double_quoted = ->
356                 switch c = txt.charAt(cur++)
357                         when '"'
358                                 tok_state = tok_state_after_attribute_value_quoted
359                         when '&'
360                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
361                         when "\u0000"
362                                 # Parse error
363                                 tok_cur_tag[2][0][1] += "\ufffd"
364                         when '' # EOF
365                                 parse_error()
366                                 tok_state = tok_state_data
367                         else
368                                 tok_cur_tag[2][0][1] += c
369                 return null
370
371         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
372         tok_state_attribute_value_single_quoted = ->
373                 switch c = txt.charAt(cur++)
374                         when "'"
375                                 tok_state = tok_state_after_attribute_value_quoted
376                         when '&'
377                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
378                         when "\u0000"
379                                 # Parse error
380                                 tok_cur_tag[2][0][1] += "\ufffd"
381                         when '' # EOF
382                                 parse_error()
383                                 tok_state = tok_state_data
384                         else
385                                 tok_cur_tag[2][0][1] += c
386                 return null
387
388         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
389         tok_state_attribute_value_unquoted = ->
390                 switch c = txt.charAt(cur++)
391                         when "\t", "\n", "\u000c", ' '
392                                 tok_state = tok_state_before_attribute_name
393                         when '&'
394                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
395                         when '>'
396                                 tok_state = tok_state_data
397                                 tmp = tok_cur_tag
398                                 tok_cur_tag = null
399                                 return tmp
400                         when "\u0000"
401                                 tok_cur_tag[2][0][1] += "\ufffd"
402                         when '' # EOF
403                                 parse_error()
404                                 tok_state = tok_state_data
405                         else
406                                 # Parse Error if ', <, = or ` (backtick)
407                                 tok_cur_tag[2][0][1] += c
408                 return null
409
410         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
411         tok_state_after_attribute_value_quoted = ->
412                 switch c = txt.charAt(cur++)
413                         when "\t", "\n", "\u000c", ' '
414                                 tok_state = tok_state_before_attribute_name
415                         when '/'
416                                 tok_state = tok_state_self_closing_start_tag
417                         when '>'
418                                 tok_state = tok_state_data
419                                 tmp = tok_cur_tag
420                                 tok_cur_tag = null
421                                 return tmp
422                         when '' # EOF
423                                 parse_error()
424                                 tok_state = tok_state_data
425                         else
426                                 # Parse Error
427                                 tok_state = tok_state_before_attribute_name
428                                 cur -= 1 # we didn't handle that char
429                 return null
430
431         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
432         # Don't set this as a state, just call it
433         # returns a string (NOT a text node)
434         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
435                 if cur >= txt.length
436                         return '&'
437                 switch c = txt.charAt(cur)
438                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
439                                 # explicitly not a parse error
440                                 return '&'
441                         when ';'
442                                 # there has to be "one or more" alnums between & and ; to be a parse error
443                                 return '&'
444                         when '#'
445                                 if cur + 1 >= txt.length
446                                         return '&'
447                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
448                                         prefix = '#x'
449                                         charset = hex_chars
450                                         start = cur + 2
451                                 else
452                                         charset = digits
453                                         start = cur + 1
454                                         prefix = '#'
455                                 i = 0
456                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
457                                         i += 1
458                                 if i is 0
459                                         return '&'
460                                 if txt.charAt(start + i) is ';'
461                                         i += 1
462                                 # FIXME This is supposed to generate parse errors for some chars
463                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
464                                 if decoded?
465                                         cur = start + i
466                                         return decoded
467                                 return '&'
468                         else
469                                 for i in [0...31]
470                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
471                                                 break
472                                 if i is 0
473                                         # exit early, because parse_error() below needs at least one alnum
474                                         return '&'
475                                 if txt.charAt(cur + i) is ';'
476                                         i += 1 # include ';' terminator in value
477                                         decoded = decode_named_char_ref txt.substr(cur, i)
478                                         if decoded?
479                                                 cur += i
480                                                 return decoded
481                                         parse_error()
482                                         return '&'
483                                 else
484                                         # no ';' terminator (only legacy char refs)
485                                         max = i
486                                         for i in [2..max] # no prefix matches, so ok to check shortest first
487                                                 c = legacy_char_refs[txt.substr(cur, i)]
488                                                 if c?
489                                                         if in_attr
490                                                                 if txt.charAt(cur + i) is '='
491                                                                         # "because some legacy user agents will
492                                                                         # misinterpret the markup in those cases"
493                                                                         parse_error()
494                                                                         return '&'
495                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
496                                                                         # this makes attributes forgiving about url args
497                                                                         return '&'
498                                                         # ok, and besides the weird exceptions for attributes...
499                                                         # return the matching char
500                                                         cur += i # consume entity chars
501                                                         parse_error() # because no terminating ";"
502                                                         return c
503                                         parse_error()
504                                         return '&'
505                 return # never reached
506
507         # the functions below impliment the Tree Contstruction algorithm here:
508         # http://www.w3.org/TR/html5/syntax.html#tree-construction
509         # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
510         tree_append = (t) ->
511                 switch t[0]
512                         when TYPE_TEXT
513                                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
514                                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
515                                 else
516                                         open_tags[0][3].push t
517                         when TYPE_OPEN_TAG
518                                 t[0] = TYPE_TAG
519                                 # convert attributes into a hash
520                                 attrs = {}
521                                 while t[2].length
522                                         a = t[2].pop()
523                                         attrs[a[0]] = a[1]
524                                 t[2] = attrs
525                                 # FIXME probs have to auto-close things first
526                                 open_tags[0][3].push t
527                                 open_tags.unshift t
528                                 # TODO implement formatting elements thing
529                         when TYPE_CLOSE_TAG
530                                 # FIXME this is just a hack for now
531                                 if open_tags.length < 2
532                                         parse_error()
533                                         return
534                                 if open_tags[0][1] isnt t[1]
535                                         parse_error()
536                                         # fall through and close something anyway
537                                 open_tags.shift()
538                         when TYPE_EOF
539                                 return
540                         # TODO implement close tags
541                         # TODO implement self-closing tags
542                         else
543                                 console.log "UNIMPLEMENTED tag type: #{t[0]}"
544                 return
545
546         # tree constructor initialization
547         # see comments on TYPE_TAG/etc for the structure of this data
548         tree = [TYPE_TAG, 'html', {}, []]
549         open_tags = [tree]
550         tree_state = tree_append
551
552         # tokenizer initialization
553         tok_state = tok_state_data
554
555         # proccess input
556         loop
557                 t = tok_state()
558                 if t?
559                         tree_state t
560                         if t[0] is TYPE_EOF
561                                 return tree[3]
562         return # never reached
563
564 # everything below is tests on the above
565 test_equals = (description, fn, args..., expected_output) ->
566         output = fn.apply this, args
567         if output is expected_output
568                 console.log "passed: #{description}."
569         else
570                 console.log "FAILED: #{description}..."
571                 console.log "   Expected: #{expected_output}"
572                 console.log "     Actual: #{output}"
573 html_to_json = (html) ->
574         return JSON.stringify parse_html html
575 test_equals "empty", html_to_json, "", '[]'
576 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
577 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
578 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
579 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
580 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
581 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
582 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
583 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
584 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
585 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
586 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'