1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an object of the Node class. Here are the Node types:
26 TYPE_TAG = 0 # name, {attributes}, [children]
27 TYPE_TEXT = 1 # "text"
30 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
32 TYPE_END_TAG = 5 # name
# Build a node of the given type.
#
# type - one of the TYPE_* constants above
# args - optional hash of initial field values. Recognized keys: name, text,
#        attrs, attrs_a (older spelling: attr_k) and children. Any key that
#        is missing or null falls back to an empty default.
constructor: (type, args = {}) ->
	@type = type # one of the TYPE_* constants above
	@name = args.name ? '' # tag name
	@text = args.text ? '' # contents for text/comment nodes
	@attrs = args.attrs ? {}
	# The field is named attrs_a, so honour an option of the same name.
	# attr_k is still read as a fallback so existing callers keep working.
	@attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
	@children = args.children ? []
43 serialize: -> # for unit tests
48 ret += JSON.stringify @name
50 ret += JSON.stringify @attrs
60 ret += JSON.stringify @text
63 ret += JSON.stringify @text
72 # helpers: (only take args that are normally known when parser creates nodes)
# Start-tag token carrying only its tag name; every other Node field keeps
# the constructor default.
new_open_tag = (name) ->
	new Node(TYPE_OPEN_TAG, {name})
# End-tag token carrying only its tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Text node whose contents are the string txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Comment node whose contents are the string txt.
new_comment_node = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
82 return new Node TYPE_EOF
# ASCII letters. These strings are used as character classes via indexOf()
# (tag names, attribute names, character references) and are concatenated
# into alnum, so each must contain all 26 letters exactly once.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
87 alnum = lc_alpha + uc_alpha + digits
88 hex_chars = digits + "abcdefABCDEF"
90 # some SVG elements have dashes in them
91 tag_name_chars = alnum + "-"
93 # http://www.w3.org/TR/html5/infrastructure.html#space-character
94 space_chars = "\u0009\u000a\u000c\u000d\u0020"
96 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
97 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
99 # These are the character references that don't need a terminating semicolon
100 # min length: 2, max: 6, none are a prefix of any other.
102 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
103 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
104 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
105 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
106 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
107 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
108 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
109 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
110 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
111 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
112 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
113 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
114 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
115 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
116 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
117 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
118 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
122 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
123 raw_text_elements = ['script', 'style']
124 escapable_raw_text_elements = ['textarea', 'title']
125 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
127 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
128 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
129 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
130 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
131 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
132 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
133 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
134 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
135 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
136 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
137 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
138 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
139 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
140 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
144 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
146 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
147 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
148 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
149 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
150 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
151 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
152 'determinant', 'diff', 'divergence', 'divide', 'domain',
153 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
154 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
155 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
156 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
157 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
158 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
159 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
160 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
161 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
162 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
163 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
164 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
165 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
166 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
167 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
168 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
169 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
170 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
171 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
172 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
173 'vectorproduct', 'xor'
175 # foreign_elements = [svg_elements..., mathml_elements...]
176 #normal_elements = All other allowed HTML elements are normal elements.
180 address: true, applet: true, area: true, article: true, aside: true,
181 base: true, basefont: true, bgsound: true, blockquote: true, body: true,
182 br: true, button: true, caption: true, center: true, col: true,
183 colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
184 dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
185 footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
186 h3: true, h4: true, h5: true, h6: true, head: true, header: true,
187 hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
188 isindex: true, li: true, link: true, listing: true, main: true,
189 marquee: true, meta: true, nav: true, noembed: true, noframes: true,
190 noscript: true, object: true, ol: true, p: true, param: true,
191 plaintext: true, pre: true, script: true, section: true, select: true,
192 source: true, style: true, summary: true, table: true, tbody: true,
193 td: true, template: true, textarea: true, tfoot: true, th: true,
194 thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
198 mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
201 foreignObject: true, desc: true, title: true
204 formatting_elements = {
205 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
206 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
211 # decode_named_char_ref()
213 # The list of named character references is _huge_ so ask the browser to decode
214 # for us instead of wasting bandwidth/space on including the table here.
216 # Pass without the "&" but with the ";" examples:
217 # for "&" pass "amp;"
218 # for "′" pass "x2032;"
221 textarea: document.createElement('textarea')
223 # TODO test this in IE8
224 decode_named_char_ref = (txt) ->
226 decoded = g_dncr.cache[txt]
227 return decoded if decoded?
228 g_dncr.textarea.innerHTML = txt
229 decoded = g_dncr.textarea.value
230 return null if decoded is txt
231 return g_dncr.cache[txt] = decoded
233 parse_html = (txt, parse_error_cb = null) ->
234 cur = 0 # index of next char in txt to be parsed
235 # declare tree and tokenizer variables so they're in scope below
237 open_tags = [] # stack of open elements
240 tok_cur_tag = null # partially parsed tag
241 flag_frameset_ok = null
248 console.log "Parse error at character #{cur} of #{txt.length}"
251 # the functions below implement the Tree Construction algorithm
252 # http://www.w3.org/TR/html5/syntax.html#tree-construction
254 # But first... the helpers
255 template_tag_is_open = ->
257 if t.type is TYPE_TAG and t.name is 'template'
260 is_in_scope_x = (tag_name, scope) ->
262 if t.name is tag_name
267 is_in_scope_x_y = (tag_name, scope, scope2) ->
269 if t.name is tag_name
276 standard_scopers = { # FIXME these are supposed to be namespace specific
277 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
278 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
279 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
280 'foreignObject': true, 'desc': true, 'title'
282 button_scopers = button: true
283 li_scopers = ol: true, ul: true
284 table_scopers = html: true, table: true, template: true
# Scope query for tag_name using the standard_scopers table; hands back
# whatever is_in_scope_x returns.
is_in_scope = (tag_name) ->
	is_in_scope_x(tag_name, standard_scopers)
# Scope query for tag_name using two tables, standard_scopers and
# button_scopers; hands back whatever is_in_scope_x_y returns.
is_in_button_scope = (tag_name) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers)
# Scope query for tag_name using the table_scopers table; hands back
# whatever is_in_scope_x returns.
is_in_table_scope = (tag_name) ->
	is_in_scope_x(tag_name, table_scopers)
291 is_in_select_scope = (tag_name) ->
293 if t.name is tag_name
295 if t.name isnt 'optgroup' and t.name isnt 'option'
299 reconstruct_active_formatting_elements = ->
300 # FIXME implement this
302 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
303 # FIXME implement this
304 close_p_if_in_button_scope = ->
305 if open_tags[0].name is 'p'
308 #p = find_button_scope 'p'
310 # TODO generate_implied_end_tags except for p tags
311 # TODO parse_error unless open_tags[0].name is 'p'
312 # TODO pop stack until 'p' popped
316 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
317 tree_insert_a_character = (t) ->
318 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
319 dest = open_tags[0].children
320 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
321 dest[dest.length - 1].text += t.text
325 # FIXME read spec, do this right
326 # note: this assumes it's an open tag
327 tree_insert_tag = (t) ->
328 t.type = TYPE_TAG # not TYPE_OPEN_TAG
329 # convert attributes into a hash
330 while t.attrs_a.length
332 t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
333 open_tags[0].children.push t
336 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# Append the comment node t to the children of open_tags[0].
# Returns the value of that push() call.
tree_insert_a_comment = (t) ->
	# FIXME read spec for "adjusted insertion location", etc.; this might be wrong
	parent = open_tags[0]
	parent.children.push(t)
341 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
342 tree_in_body = (t) ->
348 when "\t", "\u000a", "\u000c", "\u000d", ' '
349 reconstruct_active_formatting_elements()
350 tree_insert_a_character t
352 reconstruct_active_formatting_elements()
353 tree_insert_a_character t
354 flag_frameset_ok = false
356 tree_insert_a_comment t
363 return if template_tag_is_open()
364 root_attrs = open_tags[open_tags.length - 1].children
366 root_attrs[k] = v unless root_attrs[k]?
367 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
368 # FIXME also do this for </template> (end tag)
369 return tree_in_head t
376 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
377 close_p_if_in_button_scope()
379 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
380 close_p_if_in_button_scope()
381 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
385 # TODO lots more to implement here
386 else # any other start tag
387 reconstruct_active_formatting_elements()
391 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
392 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
395 unless ok_tags[t.name]?
398 # TODO stack of template insertion modes thing
399 flag_parsing = false # stop parsing
403 unless is_in_scope 'body'
406 # TODO implement parse error and move to tree_after_body
408 unless is_in_scope 'body' # weird, but it's what the spec says
411 # TODO implement parse error and move to tree_after_body, reprocess
412 # TODO lots more close tags to implement here
414 for node, i in open_tags
415 if node.name is t.name
416 # FIXME generate implied end tags except those with name==t.name
417 parse_error() unless i is 0
423 if special_elements[node.name]?
428 # the functions below implement the tokenizer states described here:
429 # http://www.w3.org/TR/html5/syntax.html#tokenization
431 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
433 switch c = txt.charAt(cur++)
435 return new_text_node tokenize_character_reference()
437 tok_state = tok_state_tag_open
440 return new_text_node c
442 return new_eof_token()
444 return new_text_node c
447 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
448 # not needed: tok_state_character_reference_in_data = ->
449 # just call tokenize_character_reference()
451 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
452 tok_state_tag_open = ->
453 switch c = txt.charAt(cur++)
455 tok_state = tok_state_markup_declaration_open
457 tok_state = tok_state_end_tag_open
460 tok_state = tok_state_bogus_comment
462 if lc_alpha.indexOf(c) > -1
463 tok_cur_tag = new_open_tag c
464 tok_state = tok_state_tag_name
465 else if uc_alpha.indexOf(c) > -1
466 tok_cur_tag = new_open_tag c.toLowerCase()
467 tok_state = tok_state_tag_name
470 tok_state = tok_state_data
471 cur -= 1 # we didn't parse/handle the char after <
472 return new_text_node '<'
475 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
476 tok_state_end_tag_open = ->
477 switch c = txt.charAt(cur++)
480 tok_state = tok_state_data
483 tok_state = tok_state_data
484 return new_text_node '</'
486 if uc_alpha.indexOf(c) > -1
487 tok_cur_tag = new_end_tag c.toLowerCase()
488 tok_state = tok_state_tag_name
489 else if lc_alpha.indexOf(c) > -1
490 tok_cur_tag = new_end_tag c
491 tok_state = tok_state_tag_name
494 tok_state = tok_state_bogus_comment
497 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
498 tok_state_tag_name = ->
499 switch c = txt.charAt(cur++)
500 when "\t", "\n", "\u000c", ' '
501 tok_state = tok_state_before_attribute_name
503 tok_state = tok_state_self_closing_start_tag
505 tok_state = tok_state_data
511 tok_cur_tag.name += "\ufffd"
514 tok_state = tok_state_data
516 if uc_alpha.indexOf(c) > -1
517 tok_cur_tag.name += c.toLowerCase()
519 tok_cur_tag.name += c
522 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
523 tok_state_before_attribute_name = ->
525 switch c = txt.charAt(cur++)
526 when "\t", "\n", "\u000c", ' '
529 tok_state = tok_state_self_closing_start_tag
532 tok_state = tok_state_data
539 when '"', "'", '<', '='
544 tok_state = tok_state_data
546 if uc_alpha.indexOf(c) > -1
547 attr_name = c.toLowerCase()
551 tok_cur_tag.attrs_a.unshift [attr_name, '']
552 tok_state = tok_state_attribute_name
555 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
556 tok_state_attribute_name = ->
557 switch c = txt.charAt(cur++)
558 when "\t", "\n", "\u000c", ' '
559 tok_state = tok_state_after_attribute_name
561 tok_state = tok_state_self_closing_start_tag
563 tok_state = tok_state_before_attribute_value
565 tok_state = tok_state_data
571 tok_cur_tag.attrs_a[0][0] = "\ufffd"
574 tok_cur_tag.attrs_a[0][0] = c
577 tok_state = tok_state_data
579 if uc_alpha.indexOf(c) > -1
580 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
582 tok_cur_tag.attrs_a[0][0] += c
585 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
586 tok_state_before_attribute_value = ->
587 switch c = txt.charAt(cur++)
588 when "\t", "\n", "\u000c", ' '
591 tok_state = tok_state_attribute_value_double_quoted
593 tok_state = tok_state_attribute_value_unquoted
596 tok_state = tok_state_attribute_value_single_quoted
599 tok_cur_tag.attrs_a[0][1] += "\ufffd"
600 tok_state = tok_state_attribute_value_unquoted
603 tok_state = tok_state_data
609 tok_state = tok_state_data
611 tok_cur_tag.attrs_a[0][1] += c
612 tok_state = tok_state_attribute_value_unquoted
615 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
616 tok_state_attribute_value_double_quoted = ->
617 switch c = txt.charAt(cur++)
619 tok_state = tok_state_after_attribute_value_quoted
621 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
624 tok_cur_tag.attrs_a[0][1] += "\ufffd"
627 tok_state = tok_state_data
629 tok_cur_tag.attrs_a[0][1] += c
632 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
633 tok_state_attribute_value_single_quoted = ->
634 switch c = txt.charAt(cur++)
636 tok_state = tok_state_after_attribute_value_quoted
638 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
641 tok_cur_tag.attrs_a[0][1] += "\ufffd"
644 tok_state = tok_state_data
646 tok_cur_tag.attrs_a[0][1] += c
649 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
650 tok_state_attribute_value_unquoted = ->
651 switch c = txt.charAt(cur++)
652 when "\t", "\n", "\u000c", ' '
653 tok_state = tok_state_before_attribute_name
655 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
657 tok_state = tok_state_data
662 tok_cur_tag.attrs_a[0][1] += "\ufffd"
665 tok_state = tok_state_data
667 # Parse Error if ', <, = or ` (backtick)
668 tok_cur_tag.attrs_a[0][1] += c
671 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
672 tok_state_after_attribute_value_quoted = ->
673 switch c = txt.charAt(cur++)
674 when "\t", "\n", "\u000c", ' '
675 tok_state = tok_state_before_attribute_name
677 tok_state = tok_state_self_closing_start_tag
679 tok_state = tok_state_data
685 tok_state = tok_state_data
688 tok_state = tok_state_before_attribute_name
689 cur -= 1 # we didn't handle that char
692 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
693 # Don't set this as a state, just call it
694 # returns a string (NOT a text node)
695 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
698 switch c = txt.charAt(cur)
699 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
700 # explicitly not a parse error
703 # there has to be "one or more" alnums between & and ; to be a parse error
706 if cur + 1 >= txt.length
708 if txt.charAt(cur + 1).toLowerCase() is 'x'
717 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
721 if txt.charAt(start + i) is ';'
723 # FIXME This is supposed to generate parse errors for some chars
724 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
731 if alnum.indexOf(txt.charAt(cur + i)) is -1
734 # exit early, because parse_error() below needs at least one alnum
736 if txt.charAt(cur + i) is ';'
737 i += 1 # include ';' terminator in value
738 decoded = decode_named_char_ref txt.substr(cur, i)
745 # no ';' terminator (only legacy char refs)
747 for i in [2..max] # no prefix matches, so ok to check shortest first
748 c = legacy_char_refs[txt.substr(cur, i)]
751 if txt.charAt(cur + i) is '='
752 # "because some legacy user agents will
753 # misinterpret the markup in those cases"
756 if alnum.indexOf(txt.charAt(cur + i)) > -1
757 # this makes attributes forgiving about url args
759 # ok, and besides the weird exceptions for attributes...
760 # return the matching char
761 cur += i # consume entity chars
762 parse_error() # because no terminating ";"
766 return # never reached
768 # tree constructor initialization
769 # see comments on TYPE_TAG/etc for the structure of this data
770 tree = new Node TYPE_TAG, name: 'html'
772 tree_state = tree_in_body
773 flag_frameset_ok = true
776 # tokenizer initialization
777 tok_state = tok_state_data
786 # everything below is tests on the above
787 test_equals = (description, output, expected_output) ->
788 if output is expected_output
789 console.log "passed." # don't say name, so smart consoles can merge all of these
791 console.log "FAILED: \"#{description}\""
792 console.log " Expected: #{expected_output}"
793 console.log " Actual: #{output}"
794 test_parser = (args) ->
798 parsed = parse_html args.html, errors_cb
804 serialized += t.serialize()
805 if serialized isnt args.expected or parse_errors.length isnt args.errors
806 console.log "FAILED: \"#{args.name}\""
808 console.log "passed \"#{args.name}\""
809 if serialized isnt args.expected
810 console.log " Input: #{args.html}"
811 console.log " Correct: #{args.expected}"
812 console.log " Output: #{serialized}"
813 if parse_errors.length isnt args.errors
814 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
816 test_parser name: "empty", \
820 test_parser name: "just text", \
822 expected: 'text:"abc"',
824 test_parser name: "named entity", \
826 expected: 'text:"a&1234"',
828 test_parser name: "broken named character references", \
829 html: "1&2&&3&aabbcc;",
830 expected: 'text:"1&2&&3&aabbcc;"',
832 test_parser name: "numbered entity overrides", \
833 html: "1€€ ƒ",
834 expected: 'text:"1€€ ƒ"',
836 test_parser name: "open tag", \
837 html: "foo<span>bar",
838 expected: 'text:"foo",tag:"span",{},[text:"bar"]',
839 errors: 1 # no close tag
840 test_parser name: "open tag with attributes", \
841 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
842 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
843 errors: 1 # no close tag
844 test_parser name: "open tag with attributes of various quotings", \
845 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
846 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
847 errors: 1 # no close tag
848 test_parser name: "attribute entity exceptions dq", \
849 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
850 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
851 errors: 2 # no close tag, &= in attr
852 test_parser name: "attribute entity exceptions sq", \
853 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
854 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
855 errors: 2 # no close tag, &= in attr
856 test_parser name: "attribute entity exceptions uq", \
857 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
858 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
859 errors: 2 # no close tag, &= in attr
860 test_parser name: "matching closing tags", \
861 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
862 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
864 test_parser name: "missing closing tag inside", \
865 html: "foo<div>bar<span>baz</div>qux",
866 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
867 errors: 1 # close tag mismatch
868 test_parser name: "mis-matched closing tags", \
869 html: "<span>12<div>34</span>56</div>78",
870 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
871 errors: 2 # misplaced </span>, no </span> at the end
872 test_parser name: "mis-matched formatting elements", \
873 html: "12<b>34<i>56</b>78</i>90",
874 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
875 errors: 2 # FIXME dunno how many there should be