# HTML parser meant to run in a browser, in support of WYSIWYG editor
# Copyright 2015 Jason Woofenden
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.



# This file implements a parser for html snippets, meant to be used by a
# WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
# or <body> tags, nor does it produce the top level "document" node in the dom
# tree, nor nodes for html, head or body.
#
# Instead, the data structure produced by this parser is an array of nodes.
#
# Each node is an array. The first element in the array is an integer (one of
# the TYPE_* constants below) followed by the appropriate fields for that type
# (shown below in the comments after the TYPE_* definition.)

TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_WHITESPACE = 2
TYPE_COMMENT = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_EOF = 5

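# For example, a snippet like 'foo<span class="x">bar' parses to a tree
# shaped like this (see the tests at the bottom of this file for verified
# input/output pairs):
#
#     [
#         [TYPE_TEXT, "foo"]
#         [TYPE_TAG, "span", {class: "x"}, [
#             [TYPE_TEXT, "bar"]
#         ]]
#     ]
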
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"

# These are the character references that don't need a terminating semicolon.
# Min length: 2, max: 6; none is a prefix of any other.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
        aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
        aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
        ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
        euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
        igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
        lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
        Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
        Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
        pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
        shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
        times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
        yen: '¥', yuml: 'ÿ'
}

void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
        'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
        'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
        'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
        'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
        'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
        'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
        'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
        'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
        'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
        'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
        'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
        'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
        'view', 'vkern'
]

# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
# foreign_elements = [svg_elements..., mathml_elements...]
# normal_elements = all other allowed HTML elements are normal elements


# decode_named_char_ref()
#
# The list of named character references is _huge_ so ask the browser to decode
# for us instead of wasting bandwidth/space on including the table here.
#
# Pass the reference without the leading "&" but with the trailing ";". Examples:
#    for "&amp;" pass "amp;"
#    for "&#x2032;" pass "#x2032;"
g_dncr = {
        cache: {}
        textarea: document.createElement('textarea')
}
# TODO test this in IE8
decode_named_char_ref = (txt) ->
        txt = "&#{txt}"
        decoded = g_dncr.cache[txt]
        return decoded if decoded?
        g_dncr.textarea.innerHTML = txt
        decoded = g_dncr.textarea.value
        return null if decoded is txt
        return g_dncr.cache[txt] = decoded

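# Example usage (values shown are what a spec-compliant browser decodes to):
#   decode_named_char_ref('amp;')    # '&'
#   decode_named_char_ref('#x2032;') # '\u2032' (numeric references work too)
#   decode_named_char_ref('bogus;')  # null (not a recognized reference)
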
parse_html = (txt) ->
        cur = 0 # index of next char in txt to be parsed
        # declare tree and tokenizer variables so they're in scope below
        tree = null
        tree_append_point = null
        tree_state = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag

        parse_error = ->
                console.log "Parse error at character #{cur} of #{txt.length}"

        # the functions below implement the tokenizer states described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization

        # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
        tok_state_data = ->
                switch c = txt.charAt(cur++)
                        when '&'
                                return [TYPE_TEXT, tokenize_character_reference()]
                        when '<'
                                tok_state = tok_state_tag_open
                        when "\u0000"
                                parse_error()
                                return [TYPE_TEXT, c]
                        when '' # EOF
                                return [TYPE_EOF]
                        else
                                return [TYPE_TEXT, c]
                return null

        # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
        # not needed: tok_state_character_reference_in_data = ->
        # just call tokenize_character_reference() directly

        # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
        tok_state_tag_open = ->
                switch c = txt.charAt(cur++)
                        when '!'
                                tok_state = tok_state_markup_declaration_open
                        when '/'
                                tok_state = tok_state_end_tag_open
                        when '?'
                                parse_error()
                                tok_state = tok_state_bogus_comment
                        else
                                if lc_alpha.indexOf(c) > -1
                                        tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
                                        tok_state = tok_state_tag_name
                                else if uc_alpha.indexOf(c) > -1
                                        tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
                                        tok_state = tok_state_tag_name
                                else
                                        parse_error()
                                        tok_state = tok_state_data
                                        cur -= 1 # we didn't parse/handle the char after <
                                        return [TYPE_TEXT, '<']
                return null

        # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
        tok_state_tag_name = ->
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '/'
                                tok_state = tok_state_self_closing_start_tag
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
                                parse_error()
                                tok_cur_tag[1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        tok_cur_tag[1] += c.toLowerCase()
                                else
                                        tok_cur_tag[1] += c
                return null

        # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
        tok_state_before_attribute_name = ->
                attr_name = null
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                return null
                        when '/'
                                tok_state = tok_state_self_closing_start_tag
                                return null
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
                                parse_error()
                                attr_name = "\ufffd"
                        when '"', "'", '<', '='
                                parse_error()
                                attr_name = c
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        attr_name = c.toLowerCase()
                                else
                                        attr_name = c
                if attr_name?
                        tok_cur_tag[2].unshift [attr_name, '']
                        tok_state = tok_state_attribute_name
                return null

        # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
        tok_state_attribute_name = ->
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_after_attribute_name
                        when '/'
                                tok_state = tok_state_self_closing_start_tag
                        when '='
                                tok_state = tok_state_before_attribute_value
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
                                parse_error()
                                tok_cur_tag[2][0][0] += "\ufffd"
                        when '"', "'", '<'
                                parse_error()
                                tok_cur_tag[2][0][0] += c
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        tok_cur_tag[2][0][0] += c.toLowerCase()
                                else
                                        tok_cur_tag[2][0][0] += c
                return null

        # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
        tok_state_before_attribute_value = ->
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                return null
                        when '"'
                                tok_state = tok_state_attribute_value_double_quoted
                        when '&'
                                tok_state = tok_state_attribute_value_unquoted
                                cur -= 1
                        when "'"
                                tok_state = tok_state_attribute_value_single_quoted
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
                                tok_state = tok_state_attribute_value_unquoted
                        when '>'
                                # Parse error
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                                tok_state = tok_state_attribute_value_unquoted
                return null

        # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
        tok_state_attribute_value_double_quoted = ->
                switch c = txt.charAt(cur++)
                        when '"'
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
                                tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                return null

        # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
        tok_state_attribute_value_single_quoted = ->
                switch c = txt.charAt(cur++)
                        when "'"
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
                                tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                return null

        # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
        tok_state_attribute_value_unquoted = ->
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '&'
                                tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
                                tok_cur_tag[2][0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                # Parse Error if ", ', <, = or ` (backtick)
                                tok_cur_tag[2][0][1] += c
                return null

        # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
        tok_state_after_attribute_value_quoted = ->
                switch c = txt.charAt(cur++)
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '/'
                                tok_state = tok_state_self_closing_start_tag
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                # Parse Error
                                tok_state = tok_state_before_attribute_name
                                cur -= 1 # we didn't handle that char
                return null

        # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
        # Don't set this as a state, just call it
        # returns a string (NOT a text node)
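        # For example, with txt = "&amp;x" and cur pointing at the 'a' (the '&'
        # was already consumed by the caller), this returns '&' and advances
        # cur past "amp;", leaving it on the 'x'.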
        tokenize_character_reference = (allowed_char = null, in_attr = false) ->
                if cur >= txt.length
                        return '&'
                switch c = txt.charAt(cur)
                        when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
                                # explicitly not a parse error
                                return '&'
                        when ';'
                                # "&;" is not a parse error: the spec only makes it one when
                                # there are one or more alnums between the & and the ;
                                return '&'
                        when '#'
                                if cur + 1 >= txt.length
                                        return '&'
                                if txt.charAt(cur + 1).toLowerCase() is 'x'
                                        prefix = '#x'
                                        charset = hex_chars
                                        start = cur + 2
                                else
                                        charset = digits
                                        start = cur + 1
                                        prefix = '#'
                                i = 0
                                while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
                                        i += 1
                                if i is 0
                                        return '&'
                                if txt.charAt(start + i) is ';'
                                        i += 1
                                # FIXME This is supposed to generate parse errors for some chars
                                decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
                                if decoded?
                                        cur = start + i
                                        return decoded
                                return '&'
                        else
                                for i in [0...31]
                                        if alnum.indexOf(txt.charAt(cur + i)) is -1
                                                break
                                if i is 0
                                        # exit early, because parse_error() below needs at least one alnum
                                        return '&'
                                if txt.charAt(cur + i) is ';'
                                        i += 1 # include ';' terminator in value
                                        decoded = decode_named_char_ref txt.substr(cur, i)
                                        if decoded?
                                                cur += i
                                                return decoded
                                        parse_error()
                                        return '&'
                                else
                                        # no ';' terminator (only legacy char refs)
                                        max = i
                                        for i in [2..max] # no prefix matches, so ok to check shortest first
                                                c = legacy_char_refs[txt.substr(cur, i)]
                                                if c?
                                                        if in_attr
                                                                if txt.charAt(cur + i) is '='
                                                                        # "because some legacy user agents will
                                                                        # misinterpret the markup in those cases"
                                                                        parse_error()
                                                                        return '&'
                                                                if alnum.indexOf(txt.charAt(cur + i)) > -1
                                                                        # this makes attributes forgiving about url args
                                                                        return '&'
                                                        # ok, and besides the weird exceptions for attributes...
                                                        # return the matching char
                                                        cur += i # consume entity chars
                                                        parse_error() # because no terminating ";"
                                                        return c
                                        parse_error()
                                        return '&'
                return # never reached

        # the functions below implement the Tree Construction algorithm here:
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
        # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
        tree_append = (t) ->
                switch t[0]
                        when TYPE_TEXT
                                if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
                                        tree_append_point[tree_append_point.length - 1][1] += t[1]
                                else
                                        tree_append_point.push t
                        when TYPE_OPEN_TAG
                                t[0] = TYPE_TAG
                                # convert attributes into a hash
                                attrs = {}
                                while t[2].length
                                        a = t[2].pop()
                                        attrs[a[0]] = a[1]
                                t[2] = attrs
                                tree_append_point.push t
                                tree_append_point = t[3]
                                # TODO implement stack of open elements
                                # TODO implement formatting elements thing
                        when TYPE_EOF
                                return
                        # TODO implement close tags
                        # TODO implement self-closing tags
                        else
                                console.log "UNIMPLEMENTED tag type: #{t[0]}"

        # tree constructor initialization
        tree = [] # see comments on TYPE_TAG/etc for the structure of this data
        tree_append_point = tree
        tree_state = tree_append

        # tokenizer initialization
        tok_state = tok_state_data

        # process input
        loop
                t = tok_state()
                if t?
                        tree_state t
                        if t[0] is TYPE_EOF
                                return tree
        return # never reached

# everything below is tests on the above
test_equals = (description, fn, args..., expected_output) ->
        output = fn.apply this, args
        if output is expected_output
                console.log "passed: #{description}."
        else
                console.log "FAILED: #{description}..."
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
html_to_json = (html) ->
        return JSON.stringify parse_html html
test_equals "empty", html_to_json, "", '[]'
test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
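
# a couple more checks exercising behavior noted above: tag names are
# lowercased, and (since close tags are still TODO) later open tags simply
# nest inside earlier ones
test_equals "uppercase tag name is lowercased", html_to_json, "A<SPAN>b", '[[1,"A"],[0,"span",{},[[1,"b"]]]]'
test_equals "unclosed tags nest", html_to_json, "a<i>b<b>c", '[[1,"a"],[0,"i",{},[[1,"b"],[0,"b",{},[[1,"c"]]]]]]'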