1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an object of the Node class. Here are the Node types:
26 TYPE_TAG = 0 # name, {attributes}, [children]
27 TYPE_TEXT = 1 # "text"
30 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
32 TYPE_END_TAG = 5 # name
36 constructor: (type, args = {}) -> # args: optional initial field values; any field left out gets an empty default
37 @type = type # one of the TYPE_* constants above
38 @name = args.name ? '' # tag name
39 @text = args.text ? '' # contents for text/comment nodes
40 @attrs = args.attrs ? {} # attributes as a name -> value hash
41 @attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress ([key, value] pairs), TYPE_OPEN_TAG only; attr_k is the older option name and is still accepted
42 @children = args.children ? []
43 serialize: -> # for unit tests
48 ret += JSON.stringify @name
50 ret += JSON.stringify @attrs
60 ret += JSON.stringify @text
63 ret += JSON.stringify @text
72 # helpers: (only take args that are normally known when parser creates nodes)
73 new_open_tag = (name) -> # start-tag token with the given (first part of the) tag name
74 return new Node TYPE_OPEN_TAG, name: name
75 new_end_tag = (name) -> # end-tag token; carries only the tag name
76 return new Node TYPE_END_TAG, name: name
77 new_text_node = (txt) -> # text node whose text is txt
78 return new Node TYPE_TEXT, text: txt
79 new_comment_node = (txt) -> # comment node whose text is txt
80 return new Node TYPE_COMMENT, text: txt
82 return new Node TYPE_EOF
84 lc_alpha = "abcdefghijklmnopqrstuvwxyz" # all 26 lowercase ASCII letters, each exactly once
85 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # all 26 uppercase ASCII letters, each exactly once
87 alnum = lc_alpha + uc_alpha + digits
88 hex_chars = digits + "abcdefABCDEF"
# Lookup table keyed by tag name; every listed name maps to true.
# NOTE(review): the code that consults this table is not visible here -- presumably the scope checks; confirm.
89 scopers = { # FIXME these are supposed to be namespace specific
90 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
91 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
92 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
93 'foreignObject': true, 'desc': true, 'title': true
96 # some SVG elements have dashes in them
97 tag_name_chars = alnum + "-"
99 # http://www.w3.org/TR/html5/infrastructure.html#space-character
100 space_chars = "\u0009\u000a\u000c\u000d\u0020"
102 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
103 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
105 # These are the character references that don't need a terminating semicolon
106 # min length: 2, max: 6, none are a prefix of any other.
108 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
109 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
110 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
111 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
112 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
113 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
114 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
115 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
116 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
117 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
118 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
119 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
120 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
121 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
122 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
123 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
124 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
128 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
129 raw_text_elements = ['script', 'style']
130 escapable_raw_text_elements = ['textarea', 'title']
131 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
133 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
134 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
135 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
136 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
137 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
138 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
139 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
140 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
141 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
142 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
143 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
144 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
145 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
146 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
150 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
152 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
153 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
154 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
155 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
156 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
157 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
158 'determinant', 'diff', 'divergence', 'divide', 'domain',
159 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
160 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
161 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
162 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
163 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
164 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
165 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
166 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
167 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
168 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
169 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
170 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
171 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
172 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
173 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
174 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
175 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
176 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
177 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
178 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
179 'vectorproduct', 'xor'
181 # foreign_elements = [svg_elements..., mathml_elements...]
182 #normal_elements = All other allowed HTML elements are normal elements.
186 address: true, applet: true, area: true, article: true, aside: true,
187 base: true, basefont: true, bgsound: true, blockquote: true, body: true,
188 br: true, button: true, caption: true, center: true, col: true,
189 colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
190 dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
191 footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
192 h3: true, h4: true, h5: true, h6: true, head: true, header: true,
193 hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
194 isindex: true, li: true, link: true, listing: true, main: true,
195 marquee: true, meta: true, nav: true, noembed: true, noframes: true,
196 noscript: true, object: true, ol: true, p: true, param: true,
197 plaintext: true, pre: true, script: true, section: true, select: true,
198 source: true, style: true, summary: true, table: true, tbody: true,
199 td: true, template: true, textarea: true, tfoot: true, th: true,
200 thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
204 mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
207 foreignObject: true, desc: true, title: true
210 formatting_elements = {
211 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
212 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
217 # decode_named_char_ref()
219 # The list of named character references is _huge_ so ask the browser to decode
220 # for us instead of wasting bandwidth/space on including the table here.
222 # Pass without the "&" but with the ";" examples:
223 # for "&amp;" pass "amp;"
224 # for "&#x2032;" pass "#x2032;"
227 textarea: document.createElement('textarea')
229 # TODO test this in IE8
# Decode one character reference by having the browser parse it inside a
# <textarea>. Answers are kept in g_dncr.cache. Returns the decoded string,
# or null when the text comes back unchanged (nothing was decoded).
230 decode_named_char_ref = (txt) ->
232 decoded = g_dncr.cache[txt] # reuse an earlier answer if there is one
233 return decoded if decoded?
234 g_dncr.textarea.innerHTML = txt # let the browser parse the markup...
235 decoded = g_dncr.textarea.value # ...and read the decoded text back
236 return null if decoded is txt # unchanged, so not decoded; note this result is not cached
237 return g_dncr.cache[txt] = decoded # remember the answer and return it
239 parse_html = (txt, parse_error_cb = null) ->
240 cur = 0 # index of next char in txt to be parsed
241 # declare tree and tokenizer variables so they're in scope below
243 open_tags = [] # stack of open elements
246 tok_cur_tag = null # partially parsed tag
247 flag_frameset_ok = null
254 console.log "Parse error at character #{cur} of #{txt.length}"
257 # the functions below implement the Tree Construction algorithm
258 # http://www.w3.org/TR/html5/syntax.html#tree-construction
260 # But first... the helpers
261 template_tag_is_open = ->
263 if t.type is TYPE_TAG and t.name is 'template'
266 is_in_scope = (tag_name) ->
268 if t.name is tag_name
274 reconstruct_active_formatting_elements = ->
275 # FIXME implement this
277 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
278 # FIXME implement this
279 close_p_if_in_button_scope = ->
280 if open_tags[0].name is 'p'
283 #p = find_button_scope 'p'
285 # TODO generate_implied_end_tags except for p tags
286 # TODO parse_error unless open_tags[0].name is 'p'
287 # TODO pop stack until 'p' popped
291 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
292 tree_insert_a_character = (t) -> # t: text token to add under open_tags[0]
293 # FIXME read spec for "adjusted insertion location", etc; this might be wrong
294 dest = open_tags[0].children # child list of open_tags[0]
295 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
296 dest[dest.length - 1].text += t.text # last child is already text: extend it rather than add a sibling
300 # FIXME read spec, do this right
301 # note: this assumes it's an open tag
302 tree_insert_tag = (t) -> # t: open-tag token; it is mutated into a tree node and appended to open_tags[0]'s children
303 t.type = TYPE_TAG # not TYPE_OPEN_TAG
304 # convert attributes into a hash
305 while t.attrs_a.length
307 t.attrs[a[0]] = a[1] # a is a [key, value] pair; TODO check what to do with duplicate attrs
308 open_tags[0].children.push t
311 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
312 tree_insert_a_comment = (t) -> # t: comment node to add under open_tags[0]
313 # FIXME read spec for "adjusted insertion location", etc; this might be wrong
314 open_tags[0].children.push t # always appended as a new child
316 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "in body" insertion mode: processes one token t.
# NOTE(review): a number of lines of this function are not visible in this
# copy, so the comments below describe only the statements that are.
317 tree_in_body = (t) ->
323 when "\t", "\u000a", "\u000c", "\u000d", ' '
324 reconstruct_active_formatting_elements()
325 tree_insert_a_character t
327 reconstruct_active_formatting_elements()
328 tree_insert_a_character t
329 flag_frameset_ok = false
331 tree_insert_a_comment t
338 return if template_tag_is_open()
339 root_attrs = open_tags[open_tags.length - 1].attrs # attribute hash (not the child list) of the last open_tags entry
341 root_attrs[k] = v unless root_attrs[k]? # only add attributes that are not already set
342 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
343 # FIXME also do this for </template> (end tag)
344 return tree_in_head t
351 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
352 close_p_if_in_button_scope()
354 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
355 close_p_if_in_button_scope()
356 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
360 # TODO lots more to implement here
361 else # any other start tag
362 reconstruct_active_formatting_elements()
366 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
367 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
370 unless ok_tags[t.name]?
373 # TODO stack of template insertion modes thing
374 flag_parsing = false # stop parsing
378 unless is_in_scope 'body'
381 # TODO implement parse error and move to tree_after_body
383 unless is_in_scope 'body' # weird, but it's what the spec says
386 # TODO implement parse error and move to tree_after_body, reprocess
387 # TODO lots more close tags to implement here
389 for node, i in open_tags
390 if node.name is t.name
391 # FIXME generate implied end tags except those with name==t.name
392 parse_error() unless i is 0
398 if special_elements[node.name]?
403 # the functions below implement the tokenizer states described here:
404 # http://www.w3.org/TR/html5/syntax.html#tokenization
406 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
408 switch c = txt.charAt(cur++)
410 return new_text_node tokenize_character_reference()
412 tok_state = tok_state_tag_open
415 return new_text_node c
417 return new_eof_token()
419 return new_text_node c
422 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
423 # not needed: tok_state_character_reference_in_data = ->
424 # just call tokenize_character_reference()
426 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
427 tok_state_tag_open = ->
428 switch c = txt.charAt(cur++)
430 tok_state = tok_state_markup_declaration_open
432 tok_state = tok_state_end_tag_open
435 tok_state = tok_state_bogus_comment
437 if lc_alpha.indexOf(c) > -1
438 tok_cur_tag = new_open_tag c
439 tok_state = tok_state_tag_name
440 else if uc_alpha.indexOf(c) > -1
441 tok_cur_tag = new_open_tag c.toLowerCase()
442 tok_state = tok_state_tag_name
445 tok_state = tok_state_data
446 cur -= 1 # we didn't parse/handle the char after <
447 return new_text_node '<'
450 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
451 tok_state_end_tag_open = ->
452 switch c = txt.charAt(cur++)
455 tok_state = tok_state_data
458 tok_state = tok_state_data
459 return new_text_node '</'
461 if uc_alpha.indexOf(c) > -1
462 tok_cur_tag = new_end_tag c.toLowerCase()
463 tok_state = tok_state_tag_name
464 else if lc_alpha.indexOf(c) > -1
465 tok_cur_tag = new_end_tag c
466 tok_state = tok_state_tag_name
469 tok_state = tok_state_bogus_comment
472 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
473 tok_state_tag_name = ->
474 switch c = txt.charAt(cur++)
475 when "\t", "\n", "\u000c", ' '
476 tok_state = tok_state_before_attribute_name
478 tok_state = tok_state_self_closing_start_tag
480 tok_state = tok_state_data
486 tok_cur_tag.name += "\ufffd"
489 tok_state = tok_state_data
491 if uc_alpha.indexOf(c) > -1
492 tok_cur_tag.name += c.toLowerCase()
494 tok_cur_tag.name += c
497 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
498 tok_state_before_attribute_name = ->
500 switch c = txt.charAt(cur++)
501 when "\t", "\n", "\u000c", ' '
504 tok_state = tok_state_self_closing_start_tag
507 tok_state = tok_state_data
514 when '"', "'", '<', '='
519 tok_state = tok_state_data
521 if uc_alpha.indexOf(c) > -1
522 attr_name = c.toLowerCase()
526 tok_cur_tag.attrs_a.unshift [attr_name, '']
527 tok_state = tok_state_attribute_name
530 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute name state: consumes one character. The attribute being read is
# tok_cur_tag.attrs_a[0] (a [name, value] pair); this state only ever appends
# to its name, because the name was started in the before-attribute-name state.
# NOTE(review): the `when` lines between the statements below are not visible
# in this copy.
531 tok_state_attribute_name = ->
532 switch c = txt.charAt(cur++)
533 when "\t", "\n", "\u000c", ' '
534 tok_state = tok_state_after_attribute_name
536 tok_state = tok_state_self_closing_start_tag
538 tok_state = tok_state_before_attribute_value
540 tok_state = tok_state_data
546 tok_cur_tag.attrs_a[0][0] += "\ufffd" # append the replacement character, keeping the name read so far
549 tok_cur_tag.attrs_a[0][0] += c # append as-is
552 tok_state = tok_state_data
554 if uc_alpha.indexOf(c) > -1
555 tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # attribute names are stored lowercased
557 tok_cur_tag.attrs_a[0][0] += c
560 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
561 tok_state_before_attribute_value = ->
562 switch c = txt.charAt(cur++)
563 when "\t", "\n", "\u000c", ' '
566 tok_state = tok_state_attribute_value_double_quoted
568 tok_state = tok_state_attribute_value_unquoted
571 tok_state = tok_state_attribute_value_single_quoted
574 tok_cur_tag.attrs_a[0][1] += "\ufffd"
575 tok_state = tok_state_attribute_value_unquoted
578 tok_state = tok_state_data
584 tok_state = tok_state_data
586 tok_cur_tag.attrs_a[0][1] += c
587 tok_state = tok_state_attribute_value_unquoted
590 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
591 tok_state_attribute_value_double_quoted = ->
592 switch c = txt.charAt(cur++)
594 tok_state = tok_state_after_attribute_value_quoted
596 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
599 tok_cur_tag.attrs_a[0][1] += "\ufffd"
602 tok_state = tok_state_data
604 tok_cur_tag.attrs_a[0][1] += c
607 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
608 tok_state_attribute_value_single_quoted = ->
609 switch c = txt.charAt(cur++)
611 tok_state = tok_state_after_attribute_value_quoted
613 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
616 tok_cur_tag.attrs_a[0][1] += "\ufffd"
619 tok_state = tok_state_data
621 tok_cur_tag.attrs_a[0][1] += c
624 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
625 tok_state_attribute_value_unquoted = ->
626 switch c = txt.charAt(cur++)
627 when "\t", "\n", "\u000c", ' '
628 tok_state = tok_state_before_attribute_name
630 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
632 tok_state = tok_state_data
637 tok_cur_tag.attrs_a[0][1] += "\ufffd"
640 tok_state = tok_state_data
642 # Parse Error if ', <, = or ` (backtick)
643 tok_cur_tag.attrs_a[0][1] += c
646 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
647 tok_state_after_attribute_value_quoted = ->
648 switch c = txt.charAt(cur++)
649 when "\t", "\n", "\u000c", ' '
650 tok_state = tok_state_before_attribute_name
652 tok_state = tok_state_self_closing_start_tag
654 tok_state = tok_state_data
660 tok_state = tok_state_data
663 tok_state = tok_state_before_attribute_name
664 cur -= 1 # we didn't handle that char
667 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
668 # Don't set this as a state, just call it
669 # returns a string (NOT a text node)
670 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
673 switch c = txt.charAt(cur)
674 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
675 # explicitly not a parse error
678 # there has to be "one or more" alnums between & and ; to be a parse error
681 if cur + 1 >= txt.length
683 if txt.charAt(cur + 1).toLowerCase() is 'x'
692 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
696 if txt.charAt(start + i) is ';'
698 # FIXME This is supposed to generate parse errors for some chars
699 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
706 if alnum.indexOf(txt.charAt(cur + i)) is -1
709 # exit early, because parse_error() below needs at least one alnum
711 if txt.charAt(cur + i) is ';'
712 i += 1 # include ';' terminator in value
713 decoded = decode_named_char_ref txt.substr(cur, i)
720 # no ';' terminator (only legacy char refs)
722 for i in [2..max] # no prefix matches, so ok to check shortest first
723 c = legacy_char_refs[txt.substr(cur, i)]
726 if txt.charAt(cur + i) is '='
727 # "because some legacy user agents will
728 # misinterpret the markup in those cases"
731 if alnum.indexOf(txt.charAt(cur + i)) > -1
732 # this makes attributes forgiving about url args
734 # ok, and besides the weird exceptions for attributes...
735 # return the matching char
736 cur += i # consume entity chars
737 parse_error() # because no terminating ";"
741 return # never reached
743 # tree constructor initialization
744 # see comments on TYPE_TAG/etc for the structure of this data
745 tree = new Node TYPE_TAG, name: 'html'
747 tree_state = tree_in_body
748 flag_frameset_ok = true
751 # tokenizer initialization
752 tok_state = tok_state_data
761 # everything below is tests on the above
762 test_equals = (description, output, expected_output) ->
763 if output is expected_output
764 console.log "passed." # don't say name, so smart consoles can merge all of these
766 console.log "FAILED: \"#{description}\""
767 console.log " Expected: #{expected_output}"
768 console.log " Actual: #{output}"
769 test_parser = (args) ->
773 parsed = parse_html args.html, errors_cb
779 serialized += t.serialize()
780 if serialized isnt args.expected or parse_errors.length isnt args.errors
781 console.log "test FAILED: \"#{args.name}\""
783 console.log 'test passed'
784 if serialized isnt args.expected
785 console.log " Input: #{args.html}"
786 console.log " Correct: #{args.expected}"
787 console.log " Output: #{serialized}"
788 if parse_errors.length isnt args.errors
789 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
791 test_parser name: "empty", \
795 test_parser name: "just text", \
797 expected: 'text:"abc"',
799 test_parser name: "named entity", \
801 expected: 'text:"a&1234"',
803 test_parser name: "broken named character references", \
804 html: "1&2&&3&aabbcc;",
805 expected: 'text:"1&2&&3&aabbcc;"',
807 test_parser name: "numbered entity overrides", \
808 html: "1€€ ƒ",
809 expected: 'text:"1€€ ƒ"',
811 test_parser name: "open tag", \
812 html: "foo<span>bar",
813 expected: 'text:"foo",tag:"span",{},[text:"bar"]',
814 errors: 1 # no close tag
815 test_parser name: "open tag with attributes", \
816 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
817 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
818 errors: 1 # no close tag
819 test_parser name: "open tag with attributes of various quotings", \
820 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
821 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
822 errors: 1 # no close tag
823 test_parser name: "attribute entity exceptions dq", \
824 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
825 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
826 errors: 2 # no close tag, &= in attr
827 test_parser name: "attribute entity exceptions sq", \
828 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
829 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
830 errors: 2 # no close tag, &= in attr
831 test_parser name: "attribute entity exceptions uq", \
832 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
833 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
834 errors: 2 # no close tag, &= in attr
835 test_parser name: "matching closing tags", \
836 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
837 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
839 test_parser name: "mis-matched closing tags", \
840 html: "foo<div>bar<span>baz</div>qux",
841 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
842 errors: 1 # close tag mismatch