1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node-type tags: the first element of every node array is one of these.
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
# NOTE(review): types 2/3 are defined on lines elided from this excerpt.
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_END_TAG = 5 # name
# Character classes used by the tokenizer and entity decoder.
# Fixed: both alphabets previously ended "...wxqz" — the letter 'y' was
# missing and 'q' appeared twice, so any tag name or named character
# reference containing 'y' (e.g. &yen;, <symbol>) failed to tokenize.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# NOTE(review): `digits` is defined on a line elided from this excerpt
# (original line 40) — presumably "0123456789"; verify against full source.
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
# Elements that terminate "in scope" searches up the open-element stack.
43 scopers = { # FIXME these are supposed to be namespace specific
44 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
45 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
46 'foreignObject', 'desc', 'title'
# NOTE(review): the closing brace of `scopers` falls on a line elided here.
49 # some SVG elements have dashes in them
50 tag_name_chars = alnum + "-"
52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
# The five "space characters" the HTML5 spec treats as whitespace.
53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# Broader Unicode whitespace set (superset of space_chars above).
56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
58 # These are the character references that don't need a terminating semicolon
59 # min length: 2, max: 6, none are a prefix of any other.
# NOTE(review): the `legacy_char_refs = {` opener (and the table's tail
# entries plus closing brace) fall on lines elided from this excerpt.
61 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
62 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
63 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
64 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
65 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
66 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
67 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
68 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
69 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
70 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
71 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
72 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
73 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
74 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
75 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
76 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
77 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element-category lists per the HTML5 syntax spec:
# void elements never have closing tags; raw-text elements suspend
# normal tokenization until their matching close tag.
81 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
82 raw_text_elements = ['script', 'style']
83 escapable_raw_text_elements = ['textarea', 'title']
84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# NOTE(review): the `svg_elements = [` opener and the list's final entries
# plus closing bracket fall on lines elided from this excerpt.
86 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
87 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
88 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
89 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
90 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
91 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
92 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
93 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
94 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
95 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
96 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
97 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
98 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
99 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# NOTE(review): the `mathml_elements = [` opener and the list's closing
# bracket fall on lines elided from this excerpt.
105 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
106 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
107 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
108 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
109 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
110 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
111 'determinant', 'diff', 'divergence', 'divide', 'domain',
112 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
113 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
114 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
115 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
116 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
117 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
118 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
119 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
120 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
121 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
122 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
123 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
124 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
125 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
126 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
127 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
128 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
129 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
130 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
131 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
132 'vectorproduct', 'xor'
134 # foreign_elements = [svg_elements..., mathml_elements...]
135 #normal_elements = All other allowed HTML elements are normal elements.
# NOTE(review): the `special_elements = {` opener (and section-separating
# lines between the HTML, MathML and SVG groups) are elided from this
# excerpt; entries below are keyed by tag name with value true.
139 address: true, applet: true, area: true, article: true, aside: true,
140 base: true, basefont: true, bgsound: true, blockquote: true, body: true,
141 br: true, button: true, caption: true, center: true, col: true,
142 colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
143 dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
144 footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
145 h3: true, h4: true, h5: true, h6: true, head: true, header: true,
146 hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
147 isindex: true, li: true, link: true, listing: true, main: true,
148 marquee: true, meta: true, nav: true, noembed: true, noframes: true,
149 noscript: true, object: true, ol: true, p: true, param: true,
150 plaintext: true, pre: true, script: true, section: true, select: true,
151 source: true, style: true, summary: true, table: true, tbody: true,
152 td: true, template: true, textarea: true, tfoot: true, th: true,
153 thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
# MathML / SVG "special" elements:
157 mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
160 foreignObject: true, desc: true, title: true
# Formatting elements per the HTML5 "list of active formatting elements".
163 formatting_elements = {
164 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
165 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
170 # decode_named_char_ref()
172 # The list of named character references is _huge_ so ask the browser to decode
173 # for us instead of wasting bandwidth/space on including the table here.
175 # Pass without the "&" but with the ";" examples:
176 # for "&" pass "amp;"
177 # for "′" pass "x2032;"
# NOTE(review): the `g_dncr = {` opener and its `cache: {}` field are on
# lines elided from this excerpt.
180 textarea: document.createElement('textarea')
182 # TODO test this in IE8
# Decode a named character reference via the browser: memoized in
# g_dncr.cache; assigns the reference to a detached <textarea>'s
# innerHTML and reads back .value. A round-trip that returns the input
# unchanged means the reference was not recognized, so return null.
183 decode_named_char_ref = (txt) ->
185 decoded = g_dncr.cache[txt]
186 return decoded if decoded?
187 g_dncr.textarea.innerHTML = txt
188 decoded = g_dncr.textarea.value
189 return null if decoded is txt
190 return g_dncr.cache[txt] = decoded
# parse_html(txt): tokenize and tree-construct an HTML snippet.
# NOTE(review): many original lines are elided from this excerpt (the
# fused numbering skips), so several bodies below appear incomplete;
# do not assume missing branches — verify against the full source.
# Presumably returns the children array of the synthetic root node
# (tests serialize its return as a JSON array of nodes) — TODO confirm.
192 parse_html = (txt) ->
193 cur = 0 # index of next char in txt to be parsed
194 # declare tree and tokenizer variables so they're in scope below
196 open_tags = [] # stack of open elements
199 tok_cur_tag = null # partially parsed tag
200 flag_frameset_ok = null
204 console.log "Parse error at character #{cur} of #{txt.length}"
207 # the functions below implement the Tree Construction algorithm
208 # http://www.w3.org/TR/html5/syntax.html#tree-construction
210 # But first... the helpers
211 template_tag_is_open = ->
213 if t[0] is TYPE_TAG and t[1] is 'template'
216 is_in_scope = (tag_name) ->
218 if t[0] is TYPE_TAG and t[1] is tag_name
220 # FIXME bail if in scopers
223 reconstruct_active_formatting_elements = ->
224 # FIXME implement this
226 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
227 # FIXME implement this
228 close_p_if_in_button_scope = ->
229 if open_tags[0][1] is 'p' # FIXME
232 #p = find_button_scope 'p'
234 # TODO generate_implied_end_tags except for p tags
235 # TODO parse_error unless open_tags[0][1] is 'p'
236 # TODO pop stack until 'p' popped
240 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
# Appends text to the current open element, coalescing with a trailing
# TYPE_TEXT child when one exists.
241 tree_insert_a_character = (t) ->
242 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
243 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
244 open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
246 open_tags[0][3].push t
248 # FIXME read spec, do this right
249 # note: this assumes it's an open tag
250 tree_insert_tag = (t) ->
251 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
252 # convert attributes into a hash
258 open_tags[0][3].push t
261 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
262 tree_insert_a_comment = (t) ->
263 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
264 open_tags[0][3].push t
266 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
267 tree_in_body = (t) ->
273 when "\t", "\u000a", "\u000c", "\u000d", ' '
274 reconstruct_active_formatting_elements()
275 tree_insert_a_character t
277 reconstruct_active_formatting_elements()
278 tree_insert_a_character t
279 flag_frameset_ok = false
281 tree_insert_a_comment t
288 return if template_tag_is_open()
289 root_attrs = open_tags[open_tags.length - 1][3]
291 root_attrs[k] = v unless root_attrs[k]?
292 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
293 # FIXME also do this for </template> (end tag)
294 return tree_in_head t
301 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
302 close_p_if_in_button_scope()
304 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
305 close_p_if_in_button_scope()
306 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
310 # TODO lots more to implement here
311 else # any other start tag
312 reconstruct_active_formatting_elements()
316 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
317 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
320 unless ok_tags[t[1]]?
323 # TODO stack of template insertion modes thing
324 flag_parsing = false # stop parsing
328 unless is_in_scope 'body'
331 # TODO implement parse error and move to tree_after_body
333 unless is_in_scope 'body' # weird, but it's what the spec says
336 # TODO implement parse error and move to tree_after_body, reprocess
337 # TODO lots more close tags to implement here
339 for node, i in open_tags
341 # FIXME generate implied end tags except those with name==t[1]
342 parse_error() unless i is 0
348 if special_elements[node[1]]?
353 # the functions below implement the tokenizer states described here:
354 # http://www.w3.org/TR/html5/syntax.html#tokenization
356 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
358 switch c = txt.charAt(cur++)
360 return [TYPE_TEXT, tokenize_character_reference()]
362 tok_state = tok_state_tag_open
365 return [TYPE_TEXT, c]
369 return [TYPE_TEXT, c]
372 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
373 # not needed: tok_state_character_reference_in_data = ->
374 # just call tok_state_character_reference_in_data()
376 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
377 tok_state_tag_open = ->
378 switch c = txt.charAt(cur++)
380 tok_state = tok_state_markup_declaration_open
382 tok_state = tok_state_end_tag_open
385 tok_state = tok_state_bogus_comment
387 if lc_alpha.indexOf(c) > -1
388 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
389 tok_state = tok_state_tag_name
390 else if uc_alpha.indexOf(c) > -1
391 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
392 tok_state = tok_state_tag_name
395 tok_state = tok_state_data
396 cur -= 1 # we didn't parse/handle the char after <
397 return [TYPE_TEXT, '<']
400 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
401 tok_state_end_tag_open = ->
402 switch c = txt.charAt(cur++)
405 tok_state = tok_state_data
408 tok_state = tok_state_data
409 return [TYPE_TEXT, '</']
411 if uc_alpha.indexOf(c) > -1
412 tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
413 tok_state = tok_state_tag_name
414 else if lc_alpha.indexOf(c) > -1
415 tok_cur_tag = [TYPE_END_TAG, c, [], []]
416 tok_state = tok_state_tag_name
419 tok_state = tok_state_bogus_comment
422 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
423 tok_state_tag_name = ->
424 switch c = txt.charAt(cur++)
425 when "\t", "\n", "\u000c", ' '
426 tok_state = tok_state_before_attribute_name
428 tok_state = tok_state_self_closing_start_tag
430 tok_state = tok_state_data
436 tok_cur_tag[1] += "\ufffd"
439 tok_state = tok_state_data
441 if uc_alpha.indexOf(c) > -1
442 tok_cur_tag[1] += c.toLowerCase()
447 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
448 tok_state_before_attribute_name = ->
450 switch c = txt.charAt(cur++)
451 when "\t", "\n", "\u000c", ' '
454 tok_state = tok_state_self_closing_start_tag
457 tok_state = tok_state_data
464 when '"', "'", '<', '='
469 tok_state = tok_state_data
471 if uc_alpha.indexOf(c) > -1
472 attr_name = c.toLowerCase()
476 tok_cur_tag[2].unshift [attr_name, '']
477 tok_state = tok_state_attribute_name
480 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
481 tok_state_attribute_name = ->
482 switch c = txt.charAt(cur++)
483 when "\t", "\n", "\u000c", ' '
484 tok_state = tok_state_after_attribute_name
486 tok_state = tok_state_self_closing_start_tag
488 tok_state = tok_state_before_attribute_value
490 tok_state = tok_state_data
# NOTE(review): the `=` assignments on the next two visible entries look
# like they should be `+=` (spec says *append* to the current attribute's
# name) — the else branch below does use `+=`. Verify against full source.
496 tok_cur_tag[2][0][0] = "\ufffd"
499 tok_cur_tag[2][0][0] = c
502 tok_state = tok_state_data
504 if uc_alpha.indexOf(c) > -1
505 tok_cur_tag[2][0][0] = c.toLowerCase()
507 tok_cur_tag[2][0][0] += c
510 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
511 tok_state_before_attribute_value = ->
512 switch c = txt.charAt(cur++)
513 when "\t", "\n", "\u000c", ' '
516 tok_state = tok_state_attribute_value_double_quoted
518 tok_state = tok_state_attribute_value_unquoted
521 tok_state = tok_state_attribute_value_single_quoted
524 tok_cur_tag[2][0][1] += "\ufffd"
525 tok_state = tok_state_attribute_value_unquoted
528 tok_state = tok_state_data
534 tok_state = tok_state_data
536 tok_cur_tag[2][0][1] += c
537 tok_state = tok_state_attribute_value_unquoted
540 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
541 tok_state_attribute_value_double_quoted = ->
542 switch c = txt.charAt(cur++)
544 tok_state = tok_state_after_attribute_value_quoted
546 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
549 tok_cur_tag[2][0][1] += "\ufffd"
552 tok_state = tok_state_data
554 tok_cur_tag[2][0][1] += c
557 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
558 tok_state_attribute_value_single_quoted = ->
559 switch c = txt.charAt(cur++)
561 tok_state = tok_state_after_attribute_value_quoted
563 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
566 tok_cur_tag[2][0][1] += "\ufffd"
569 tok_state = tok_state_data
571 tok_cur_tag[2][0][1] += c
574 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
575 tok_state_attribute_value_unquoted = ->
576 switch c = txt.charAt(cur++)
577 when "\t", "\n", "\u000c", ' '
578 tok_state = tok_state_before_attribute_name
580 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
582 tok_state = tok_state_data
587 tok_cur_tag[2][0][1] += "\ufffd"
590 tok_state = tok_state_data
592 # Parse Error if ', <, = or ` (backtick)
593 tok_cur_tag[2][0][1] += c
596 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
597 tok_state_after_attribute_value_quoted = ->
598 switch c = txt.charAt(cur++)
599 when "\t", "\n", "\u000c", ' '
600 tok_state = tok_state_before_attribute_name
602 tok_state = tok_state_self_closing_start_tag
604 tok_state = tok_state_data
610 tok_state = tok_state_data
613 tok_state = tok_state_before_attribute_name
614 cur -= 1 # we didn't handle that char
617 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
618 # Don't set this as a state, just call it
619 # returns a string (NOT a text node)
620 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
623 switch c = txt.charAt(cur)
624 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
625 # explicitly not a parse error
628 # there has to be "one or more" alnums between & and ; to be a parse error
631 if cur + 1 >= txt.length
633 if txt.charAt(cur + 1).toLowerCase() is 'x'
642 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
646 if txt.charAt(start + i) is ';'
648 # FIXME This is supposed to generate parse errors for some chars
649 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
656 if alnum.indexOf(txt.charAt(cur + i)) is -1
659 # exit early, because parse_error() below needs at least one alnum
661 if txt.charAt(cur + i) is ';'
662 i += 1 # include ';' terminator in value
663 decoded = decode_named_char_ref txt.substr(cur, i)
670 # no ';' terminator (only legacy char refs)
672 for i in [2..max] # no prefix matches, so ok to check shortest first
673 c = legacy_char_refs[txt.substr(cur, i)]
676 if txt.charAt(cur + i) is '='
677 # "because some legacy user agents will
678 # misinterpret the markup in those cases"
681 if alnum.indexOf(txt.charAt(cur + i)) > -1
682 # this makes attributes forgiving about url args
684 # ok, and besides the weird exceptions for attributes...
685 # return the matching char
686 cur += i # consume entity chars
687 parse_error() # because no terminating ";"
691 return # never reached
693 # tree constructor initialization
694 # see comments on TYPE_TAG/etc for the structure of this data
695 tree = [TYPE_TAG, 'html', {}, []]
697 tree_state = tree_in_body
698 flag_frameset_ok = true
701 # tokenizer initialization
702 tok_state = tok_state_data
711 # everything below is tests on the above
# Minimal test harness: applies `fn` to `args` and compares the result
# to `expected_output` with `is` (strict equality), logging pass/fail.
712 test_equals = (description, fn, args..., expected_output) ->
713 output = fn.apply this, args
714 if output is expected_output
715 console.log "passed: #{description}."
# NOTE(review): the `else` introducing the failure branch is on a line
# elided from this excerpt.
717 console.log "FAILED: #{description}..."
718 console.log " Expected: #{expected_output}"
719 console.log " Actual: #{output}"
# Serialize parse_html's node array so tests can compare by string.
720 html_to_json = (html) ->
721 return JSON.stringify parse_html html
722 test_equals "empty", html_to_json, "", '[]'
723 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
724 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
725 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
726 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
727 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
728 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
729 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
730 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
731 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
732 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
733 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
734 test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'