1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node type tags: each parsed node is an array whose first element is one of
# these integers, followed by the fields listed in the trailing comment.
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"

# ASCII letters and digits, used to scan tag names and character references.
# Fixed: the original read "...uvwxqz" / "...UVWXQZ" — it was missing y/Y
# (and repeated q/Q), so names containing 'y' were rejected.
alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
hex_chars = "0123456789abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
47 # These are the character references that don't need a terminating semicolon
48 # min length: 2, max: 6, none are a prefix of any other.
50 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
51 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
52 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
53 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
54 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
55 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
56 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
57 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
58 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
59 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
60 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
61 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
62 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
63 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
64 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
65 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
66 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Elements that never have contents or an end tag:
# http://www.w3.org/TR/html5/syntax.html#void-elements
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
# Elements whose contents are raw text — no tags or character references
# are parsed inside them.
raw_text_elements = ['script', 'style']
# Like raw text, except character references are still decoded.
escapable_raw_text_elements = ['textarea', 'title']
73 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
75 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
76 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
77 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
78 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
79 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
80 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
81 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
82 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
83 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
84 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
85 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
86 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
87 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
88 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
92 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
94 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
95 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
96 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
97 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
98 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
99 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
100 'determinant', 'diff', 'divergence', 'divide', 'domain',
101 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
102 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
103 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
104 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
105 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
106 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
107 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
108 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
109 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
110 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
111 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
112 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
113 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
114 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
115 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
116 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
117 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
118 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
119 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
120 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
121 'vectorproduct', 'xor'
123 # foreign_elements = [svg_elements..., mathml_elements...]
124 #normal_elements = All other allowed HTML elements are normal elements.
127 # decode_named_char_ref()
129 # The list of named character references is _huge_ so ask the browser to decode
130 # for us instead of wasting bandwidth/space on including the table here.
132 # Pass without the "&" but with the ";" examples:
# for "&amp;" pass "amp;"
# for "&#x2032;" pass "#x2032;"
137 textarea: document.createElement('textarea')
# TODO test this in IE8
# Decode one named character reference by letting the browser do it:
# assigning eg "amp;" to a <textarea>'s innerHTML makes the browser decode
# it, and reading .value gives the decoded text back.
# txt: the reference without the leading "&" but with the trailing ";"
# Returns the decoded string, or null if the browser left txt unchanged
# (ie it was not a valid character reference). Successful decodes are
# memoized in g_dncr.cache (g_dncr is defined earlier in this file).
decode_named_char_ref = (txt) ->
	decoded = g_dncr.cache[txt]
	return decoded if decoded?
	g_dncr.textarea.innerHTML = txt
	decoded = g_dncr.textarea.value
	# the browser echoes invalid references back unchanged
	return null if decoded is txt
	return g_dncr.cache[txt] = decoded
149 parse_html = (txt) ->
150 cur = 0 # index of next char in txt to be parsed
151 # declare tree and tokenizer variables so they're in scope below
153 tree_append_point = null
158 # the functions below implement the tokenizer stats described here:
159 # http://www.w3.org/TR/html5/syntax.html#tokenization
164 switch c = txt.charAt(cur++)
166 tok_state = tok_state_character_reference_in_data
168 tok_state = tok_state_tag_open
171 return [TYPE_TEXT, c]
173 return [TYPE_TEXT, c]
176 # & just got consumed
177 tok_state_character_reference_in_data = ->
178 tok_state = tok_state_data
180 return [TYPE_TEXT, '&']
181 switch c = txt.charAt(cur)
183 return [TYPE_TEXT, '&']
185 if cur + 1 >= txt.length
186 return [TYPE_TEXT, '&']
187 if txt.charAt(cur + 1).toLowerCase() is 'x'
196 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
199 return [TYPE_TEXT, '&']
200 if txt.charAt(start + i) is ';'
202 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
205 return [TYPE_TEXT, decoded]
206 return [TYPE_TEXT, '&']
209 if alnum.indexOf(txt.charAt(cur + i)) is -1
212 return [TYPE_TEXT, '&']
213 if txt.charAt(cur + i) is ';'
214 i += 1 # include ';' terminator in value
215 decoded = decode_named_char_ref txt.substr(cur, i)
218 return [TYPE_TEXT, decoded]
219 return [TYPE_TEXT, '&']
221 # no ';' terminator (only legacy char refs)
223 return [TYPE_TEXT, '&']
224 # FIXME: if we're inside an attribute:
225 # 1. don't parse refs that are followed by =
226 # 2. don't parse refs that are followed by alnum
228 for i in [2..max] # no prefix matches, so ok to check shortest first
229 c = legacy_char_refs[txt.substr(cur, i)]
231 cur += i # consume entity chars
232 return [TYPE_TEXT, c]
# the functions below implement the Tree Construction algorithm here:
237 # http://www.w3.org/TR/html5/syntax.html#tree-construction
239 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
240 tree_append_point[tree_append_point.length - 1][1] += t[1]
242 tree_append_point.push t
244 # tree constructor initialization
245 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
246 tree_append_point = tree
247 tree_state = tree_append
249 # tokenizer initialization
250 tok_state = tok_state_data
253 while cur < txt.length
260 # everything below is tests on the above
# Call fn with args and compare its return value (exact, via `is`) against
# expected_output, logging a pass/fail line to the console.
# description: label used in the log line
# fn: the function under test
# args...: splat of arguments to pass to fn
# expected_output: the value fn is expected to return
# (Restored the `else` between the two branches — it was lost in transit;
# without it the FAILED log line has no enclosing branch.)
test_equals = (description, fn, args..., expected_output) ->
	output = fn.apply this, args
	if output is expected_output
		console.log "passed: #{description}."
	else
		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
# Test helper: parse an html string and serialize the resulting node tree
# to a JSON string for easy comparison.
html_to_json = (html) ->
	JSON.stringify(parse_html(html))
# Smoke tests: each compares the JSON-serialized parser output against an
# expected literal (node type 1 is TYPE_TEXT).
test_equals "empty", html_to_json, "", '[]'
test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
# NOTE(review): the input below looks like it lost an "&amp;"-style escape
# in an encoding pass (input equals expected output, so nothing is decoded)
# — confirm against the original source.
test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
# NOTE(review): the numeric references here appear pre-decoded (mojibake);
# the input should contain "&#x..."-style references for this test to
# exercise numbered-entity decoding — confirm against the original source.
test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'