1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_END_TAG = 5 # name
# ASCII alphabets used below to classify tag-name and attribute-name
# characters (per the HTML5 tokenizer, ASCII letters only).
# fix: these previously read "...wxqz"/"...WXQZ" — 'y'/'Y' was missing and
# 'q'/'Q' appeared twice, so names containing y/Y were mis-tokenized.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
43 scopers = { # FIXME these are supposed to be namespace specific
44 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
45 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
46 'foreignObject', 'desc', 'title'
49 # some SVG elements have dashes in them
50 tag_name_chars = alnum + "-"
52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
58 # These are the character references that don't need a terminating semicolon
59 # min length: 2, max: 6, none are a prefix of any other.
61 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
62 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
63 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
64 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
65 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
66 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
67 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
68 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
69 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
70 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
71 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
72 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
73 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
74 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
75 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
76 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
77 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# http://www.w3.org/TR/html5/syntax.html#void-elements
# Elements that never have contents or an end tag:
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
	'link', 'meta', 'param', 'source', 'track', 'wbr'
]
# Elements whose contents are raw text (no nested tags, no char refs):
raw_text_elements = ['script', 'style']
# Like raw text, but character references are still decoded:
escapable_raw_text_elements = ['textarea', 'title']
84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
86 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
87 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
88 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
89 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
90 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
91 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
92 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
93 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
94 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
95 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
96 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
97 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
98 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
99 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
105 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
106 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
107 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
108 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
109 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
110 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
111 'determinant', 'diff', 'divergence', 'divide', 'domain',
112 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
113 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
114 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
115 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
116 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
117 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
118 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
119 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
120 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
121 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
122 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
123 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
124 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
125 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
126 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
127 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
128 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
129 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
130 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
131 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
132 'vectorproduct', 'xor'
134 # foreign_elements = [svg_elements..., mathml_elements...]
135 #normal_elements = All other allowed HTML elements are normal elements.
139 address: true, applet: true, area: true, article: true, aside: true,
140 base: true, basefont: true, bgsound: true, blockquote: true, body: true,
141 br: true, button: true, caption: true, center: true, col: true,
142 colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
143 dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
144 footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
145 h3: true, h4: true, h5: true, h6: true, head: true, header: true,
146 hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
147 isindex: true, li: true, link: true, listing: true, main: true,
148 marquee: true, meta: true, nav: true, noembed: true, noframes: true,
149 noscript: true, object: true, ol: true, p: true, param: true,
150 plaintext: true, pre: true, script: true, section: true, select: true,
151 source: true, style: true, summary: true, table: true, tbody: true,
152 td: true, template: true, textarea: true, tfoot: true, th: true,
153 thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
157 mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
160 foreignObject: true, desc: true, title: true
163 formatting_elements = {
164 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
165 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
170 # decode_named_char_ref()
172 # The list of named character references is _huge_ so ask the browser to decode
173 # for us instead of wasting bandwidth/space on including the table here.
175 # Pass without the "&" but with the ";" examples:
176 # for "&" pass "amp;"
177 # for "′" pass "x2032;"
180 textarea: document.createElement('textarea')
182 # TODO test this in IE8
# Decode a named character reference by delegating to the browser's own
# parser: write the entity into a detached <textarea> and read back the
# decoded text (the named-entity table is huge, so we don't ship it).
#
# Pass txt without the leading "&" but with the trailing ";", e.g. "amp;".
# Returns the decoded string, or null if the browser left the text
# unchanged (not a valid reference). Successful lookups are memoized in
# g_dncr.cache.
decode_named_char_ref = (txt) ->
	cached = g_dncr.cache[txt]
	return cached if cached?
	g_dncr.textarea.innerHTML = txt
	result = g_dncr.textarea.value
	return null if result is txt
	return g_dncr.cache[txt] = result
192 parse_html = (txt, parse_error_cb = null) ->
193 cur = 0 # index of next char in txt to be parsed
194 # declare tree and tokenizer variables so they're in scope below
196 open_tags = [] # stack of open elements
199 tok_cur_tag = null # partially parsed tag
200 flag_frameset_ok = null
207 console.log "Parse error at character #{cur} of #{txt.length}"
210 # the functions below implement the Tree Construction algorithm
211 # http://www.w3.org/TR/html5/syntax.html#tree-construction
213 # But first... the helpers
214 template_tag_is_open = ->
216 if t[0] is TYPE_TAG and t[1] is 'template'
219 is_in_scope = (tag_name) ->
221 if t[0] is TYPE_TAG and t[1] is tag_name
223 # FIXME bail if in scopers
226 reconstruct_active_formatting_elements = ->
227 # FIXME implement this
229 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
230 # FIXME implement this
231 close_p_if_in_button_scope = ->
232 if open_tags[0][1] is 'p' # FIXME
235 #p = find_button_scope 'p'
237 # TODO generate_implied_end_tags except for p tags
238 # TODO parse_error unless open_tags[0][1] is 'p'
239 # TODO pop stack until 'p' popped
243 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
244 tree_insert_a_character = (t) ->
245 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
246 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
247 open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
249 open_tags[0][3].push t
251 # FIXME read spec, do this right
252 # note: this assumes it's an open tag
253 tree_insert_tag = (t) ->
254 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
255 # convert attributes into a hash
261 open_tags[0][3].push t
264 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
265 tree_insert_a_comment = (t) ->
266 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
267 open_tags[0][3].push t
269 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
270 tree_in_body = (t) ->
276 when "\t", "\u000a", "\u000c", "\u000d", ' '
277 reconstruct_active_formatting_elements()
278 tree_insert_a_character t
280 reconstruct_active_formatting_elements()
281 tree_insert_a_character t
282 flag_frameset_ok = false
284 tree_insert_a_comment t
291 return if template_tag_is_open()
292 root_attrs = open_tags[open_tags.length - 1][3]
294 root_attrs[k] = v unless root_attrs[k]?
295 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
296 # FIXME also do this for </template> (end tag)
297 return tree_in_head t
304 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
305 close_p_if_in_button_scope()
307 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
308 close_p_if_in_button_scope()
309 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
313 # TODO lots more to implement here
314 else # any other start tag
315 reconstruct_active_formatting_elements()
319 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
320 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
323 unless ok_tags[t[1]]?
326 # TODO stack of template insertion modes thing
327 flag_parsing = false # stop parsing
331 unless is_in_scope 'body'
334 # TODO implement parse error and move to tree_after_body
336 unless is_in_scope 'body' # weird, but it's what the spec says
339 # TODO implement parse error and move to tree_after_body, reprocess
340 # TODO lots more close tags to implement here
342 for node, i in open_tags
344 # FIXME generate implied end tags except those with name==t[1]
345 parse_error() unless i is 0
351 if special_elements[node[1]]?
356 # the functions below implement the tokenizer states described here:
357 # http://www.w3.org/TR/html5/syntax.html#tokenization
359 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
361 switch c = txt.charAt(cur++)
363 return [TYPE_TEXT, tokenize_character_reference()]
365 tok_state = tok_state_tag_open
368 return [TYPE_TEXT, c]
372 return [TYPE_TEXT, c]
375 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
376 # not needed: tok_state_character_reference_in_data = ->
377 # just call tok_state_character_reference_in_data()
379 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
380 tok_state_tag_open = ->
381 switch c = txt.charAt(cur++)
383 tok_state = tok_state_markup_declaration_open
385 tok_state = tok_state_end_tag_open
388 tok_state = tok_state_bogus_comment
390 if lc_alpha.indexOf(c) > -1
391 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
392 tok_state = tok_state_tag_name
393 else if uc_alpha.indexOf(c) > -1
394 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
395 tok_state = tok_state_tag_name
398 tok_state = tok_state_data
399 cur -= 1 # we didn't parse/handle the char after <
400 return [TYPE_TEXT, '<']
403 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
404 tok_state_end_tag_open = ->
405 switch c = txt.charAt(cur++)
408 tok_state = tok_state_data
411 tok_state = tok_state_data
412 return [TYPE_TEXT, '</']
414 if uc_alpha.indexOf(c) > -1
415 tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
416 tok_state = tok_state_tag_name
417 else if lc_alpha.indexOf(c) > -1
418 tok_cur_tag = [TYPE_END_TAG, c, [], []]
419 tok_state = tok_state_tag_name
422 tok_state = tok_state_bogus_comment
425 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
426 tok_state_tag_name = ->
427 switch c = txt.charAt(cur++)
428 when "\t", "\n", "\u000c", ' '
429 tok_state = tok_state_before_attribute_name
431 tok_state = tok_state_self_closing_start_tag
433 tok_state = tok_state_data
439 tok_cur_tag[1] += "\ufffd"
442 tok_state = tok_state_data
444 if uc_alpha.indexOf(c) > -1
445 tok_cur_tag[1] += c.toLowerCase()
450 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
451 tok_state_before_attribute_name = ->
453 switch c = txt.charAt(cur++)
454 when "\t", "\n", "\u000c", ' '
457 tok_state = tok_state_self_closing_start_tag
460 tok_state = tok_state_data
467 when '"', "'", '<', '='
472 tok_state = tok_state_data
474 if uc_alpha.indexOf(c) > -1
475 attr_name = c.toLowerCase()
479 tok_cur_tag[2].unshift [attr_name, '']
480 tok_state = tok_state_attribute_name
483 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
484 tok_state_attribute_name = ->
485 switch c = txt.charAt(cur++)
486 when "\t", "\n", "\u000c", ' '
487 tok_state = tok_state_after_attribute_name
489 tok_state = tok_state_self_closing_start_tag
491 tok_state = tok_state_before_attribute_value
493 tok_state = tok_state_data
499 tok_cur_tag[2][0][0] = "\ufffd"
502 tok_cur_tag[2][0][0] = c
505 tok_state = tok_state_data
507 if uc_alpha.indexOf(c) > -1
508 tok_cur_tag[2][0][0] = c.toLowerCase()
510 tok_cur_tag[2][0][0] += c
513 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
514 tok_state_before_attribute_value = ->
515 switch c = txt.charAt(cur++)
516 when "\t", "\n", "\u000c", ' '
519 tok_state = tok_state_attribute_value_double_quoted
521 tok_state = tok_state_attribute_value_unquoted
524 tok_state = tok_state_attribute_value_single_quoted
527 tok_cur_tag[2][0][1] += "\ufffd"
528 tok_state = tok_state_attribute_value_unquoted
531 tok_state = tok_state_data
537 tok_state = tok_state_data
539 tok_cur_tag[2][0][1] += c
540 tok_state = tok_state_attribute_value_unquoted
543 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
544 tok_state_attribute_value_double_quoted = ->
545 switch c = txt.charAt(cur++)
547 tok_state = tok_state_after_attribute_value_quoted
549 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
552 tok_cur_tag[2][0][1] += "\ufffd"
555 tok_state = tok_state_data
557 tok_cur_tag[2][0][1] += c
560 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
561 tok_state_attribute_value_single_quoted = ->
562 switch c = txt.charAt(cur++)
564 tok_state = tok_state_after_attribute_value_quoted
566 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
569 tok_cur_tag[2][0][1] += "\ufffd"
572 tok_state = tok_state_data
574 tok_cur_tag[2][0][1] += c
577 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
578 tok_state_attribute_value_unquoted = ->
579 switch c = txt.charAt(cur++)
580 when "\t", "\n", "\u000c", ' '
581 tok_state = tok_state_before_attribute_name
583 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
585 tok_state = tok_state_data
590 tok_cur_tag[2][0][1] += "\ufffd"
593 tok_state = tok_state_data
595 # Parse Error if ', <, = or ` (backtick)
596 tok_cur_tag[2][0][1] += c
599 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
600 tok_state_after_attribute_value_quoted = ->
601 switch c = txt.charAt(cur++)
602 when "\t", "\n", "\u000c", ' '
603 tok_state = tok_state_before_attribute_name
605 tok_state = tok_state_self_closing_start_tag
607 tok_state = tok_state_data
613 tok_state = tok_state_data
616 tok_state = tok_state_before_attribute_name
617 cur -= 1 # we didn't handle that char
620 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
621 # Don't set this as a state, just call it
622 # returns a string (NOT a text node)
623 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
626 switch c = txt.charAt(cur)
627 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
628 # explicitly not a parse error
631 # there has to be "one or more" alnums between & and ; to be a parse error
634 if cur + 1 >= txt.length
636 if txt.charAt(cur + 1).toLowerCase() is 'x'
645 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
649 if txt.charAt(start + i) is ';'
651 # FIXME This is supposed to generate parse errors for some chars
652 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
659 if alnum.indexOf(txt.charAt(cur + i)) is -1
662 # exit early, because parse_error() below needs at least one alnum
664 if txt.charAt(cur + i) is ';'
665 i += 1 # include ';' terminator in value
666 decoded = decode_named_char_ref txt.substr(cur, i)
673 # no ';' terminator (only legacy char refs)
675 for i in [2..max] # no prefix matches, so ok to check shortest first
676 c = legacy_char_refs[txt.substr(cur, i)]
679 if txt.charAt(cur + i) is '='
680 # "because some legacy user agents will
681 # misinterpret the markup in those cases"
684 if alnum.indexOf(txt.charAt(cur + i)) > -1
685 # this makes attributes forgiving about url args
687 # ok, and besides the weird exceptions for attributes...
688 # return the matching char
689 cur += i # consume entity chars
690 parse_error() # because no terminating ";"
694 return # never reached
696 # tree constructor initialization
697 # see comments on TYPE_TAG/etc for the structure of this data
698 tree = [TYPE_TAG, 'html', {}, []]
700 tree_state = tree_in_body
701 flag_frameset_ok = true
704 # tokenizer initialization
705 tok_state = tok_state_data
714 # everything below is tests on the above
715 test_equals = (description, output, expected_output) ->
716 if output is expected_output
717 console.log "passed." # don't say name, so smart consoles can merge all of these
719 console.log "FAILED: \"#{description}\""
720 console.log " Expected: #{expected_output}"
721 console.log " Actual: #{output}"
722 test_parser = (args) ->
726 parsed = parse_html args.html, errors_cb
727 parsed = JSON.stringify parsed
728 if parsed isnt args.expected or parse_errors.length isnt args.errors
729 console.log "test FAILED: \"#{args.name}\""
731 console.log 'test passed'
732 if parsed isnt args.expected
733 console.log " Input: #{args.html}"
734 console.log " Correct: #{args.expected}"
735 console.log " Output: #{parsed}"
736 if parse_errors.length isnt args.errors
737 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
739 test_parser name: "empty", \
743 test_parser name: "just text", \
745 expected: '[[1,"abc"]]',
747 test_parser name: "named entity", \
749 expected: '[[1,"a&1234"]]',
751 test_parser name: "broken named character references", \
752 html: "1&2&&3&aabbcc;",
753 expected: '[[1,"1&2&&3&aabbcc;"]]',
755 test_parser name: "numbered entity overrides", \
756 html: "1€€ ƒ",
757 expected: '[[1,"1€€ ƒ"]]',
759 test_parser name: "open tag", \
760 html: "foo<span>bar",
761 expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
762 errors: 1 # no close tag
763 test_parser name: "open tag with attributes", \
764 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
765 expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
766 errors: 1 # no close tag
767 test_parser name: "open tag with attributes of various quotings", \
768 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
769 expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
770 errors: 1 # no close tag
771 test_parser name: "attribute entity exceptions dq", \
772 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
773 expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
774 errors: 2 # no close tag, &= in attr
775 test_parser name: "attribute entity exceptions sq", \
776 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
777 expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
778 errors: 2 # no close tag, &= in attr
779 test_parser name: "attribute entity exceptions uq", \
780 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
781 expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
782 errors: 2 # no close tag, &= in attr
783 test_parser name: "matching closing tags", \
784 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
785 expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
787 test_parser name: "mis-matched closing tags", \
788 html: "foo<div>bar<span>baz</div>qux",
789 expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
790 errors: 1 # close tag mismatch