1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node-type tags stored as element 0 of every node array (see file header).
# NOTE(review): this listing skips the original lines defining types 2-3
# (embedded numbering jumps from "30" to "33") - restore from the full source.
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
# ASCII letter classes used by the tokenizer (via indexOf) to recognize the
# start and body of tag and attribute names.
# BUG FIX: the original strings ended "...wxqz" / "...WXQZ" - the letter 'y'
# was missing and 'q' duplicated, so any tag or attribute name containing
# 'y' (e.g. "style") would not be recognized as alphabetic. Corrected to the
# full 26-letter alphabets.
36 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
37 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Composite character classes for the tokenizer.
# NOTE(review): `digits` is referenced below but its definition (original
# line 38, presumably digits = "0123456789") is missing from this listing -
# confirm against the full source.
39 alnum = lc_alpha + uc_alpha + digits
40 hex_chars = digits + "abcdefABCDEF"
42 # some SVG elements have dashes in them
43 tag_name_chars = alnum + "-"
45 # http://www.w3.org/TR/html5/infrastructure.html#space-character
46 space_chars = "\u0009\u000a\u000c\u000d\u0020"
48 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
49 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
51 # These are the character references that don't need a terminating semicolon
52 # min length: 2, max: 6, none are a prefix of any other.
# NOTE(review): the object opener (original line 53, presumably
# "legacy_char_refs =") and the tail of the table (original lines 71-73,
# remaining entries plus the closing brace) are missing from this listing -
# restore from the full source. Also note 'shy' below maps to U+00AD (soft
# hyphen), which renders invisibly - presumably intentional, confirm.
54 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
55 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
56 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
57 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
58 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
59 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
60 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
61 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
62 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
63 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
64 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
65 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
66 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
67 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
68 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
69 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
70 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element categories from the HTML5 spec, used by the tree constructor.
# void elements never have children; raw-text elements have unparsed bodies;
# escapable raw-text bodies may contain character references.
74 void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
75 raw_text_elements = 'script style'.split ' '
76 escapable_raw_text_elements = 'textarea title'.split ' '
77 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# NOTE(review): the list opener (original line 78, presumably
# "svg_elements = [") and the final entries plus closing bracket (original
# lines 93-94) are missing from this listing - restore from the full source.
79 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
80 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
81 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
82 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
83 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
84 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
85 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
86 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
87 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
88 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
89 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
90 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
91 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
92 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
96 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# NOTE(review): the list opener (original line 97, presumably
# "mathml_elements = [") and the closing bracket (original line 126) are
# missing from this listing - restore from the full source. Also note the
# line numbered 112 below lists 'mi' twice; the duplicate is harmless for a
# membership list but should be removed.
98 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
99 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
100 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
101 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
102 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
103 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
104 'determinant', 'diff', 'divergence', 'divide', 'domain',
105 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
106 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
107 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
108 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
109 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
110 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
111 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
112 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
113 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
114 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
115 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
116 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
117 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
118 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
119 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
120 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
121 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
122 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
123 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
124 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
125 'vectorproduct', 'xor'
127 # foreign_elements = [svg_elements..., mathml_elements...]
128 # normal_elements = All other allowed HTML elements are normal elements.
131 # decode_named_char_ref()
133 # The list of named character references is _huge_ so ask the browser to decode
134 # for us instead of wasting bandwidth/space on including the table here.
136 # Pass without the "&" but with the ";" examples:
137 # for "&" pass "amp;"
138 # for "′" (i.e. &#x2032;) pass "#x2032;"
# NOTE(review): the opener of the g_dncr cache object (original lines
# 139-140, presumably "g_dncr =" with a "cache: {}" field - it is read
# below) is missing from this listing; original line 145 is also missing.
# Restore from the full source.
141 textarea: document.createElement('textarea')
143 # TODO test this in IE8
# Decode a named character reference by letting the browser do it: assign
# the reference (without "&", with ";") as innerHTML of a detached
# <textarea> and read back the decoded value. Returns null when the browser
# did not decode it (output identical to input); results are memoized in
# g_dncr.cache.
144 decode_named_char_ref = (txt) ->
146 decoded = g_dncr.cache[txt]
147 return decoded if decoded?
148 g_dncr.textarea.innerHTML = txt
149 decoded = g_dncr.textarea.value
150 return null if decoded is txt
151 return g_dncr.cache[txt] = decoded
# Parse an HTML snippet into the node-array tree described in the file
# header. Contains the whole tokenizer/tree-builder as nested closures over
# `txt` and `cur`.
# NOTE(review): several declaration lines (original 156, 158-159, 161-162)
# are missing from this listing - restore from the full source.
153 parse_html = (txt) ->
154 cur = 0 # index of next char in txt to be parsed
155 # declare tree and tokenizer variables so they're in scope below
157 tree_append_point = null
160 tok_cur_tag = null # partially parsed tag
163 # the functions below implement the tokenizer states described here:
164 # http://www.w3.org/TR/html5/syntax.html#tokenization
166 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# NOTE(review): the line defining this state function (original line 167,
# presumably "tok_state_data = ->") is missing from this listing, as are
# the "when" clauses (original lines 169, 171, and 173-176) that select the
# branches below - restore from the full source. By analogy with the spec:
# '&' enters the character-reference state, '<' the tag-open state, and
# other characters are emitted as text tokens.
168 switch c = txt.charAt(cur++)
170 tok_state = tok_state_character_reference_in_data
172 tok_state = tok_state_tag_open
175 return [TYPE_TEXT, c]
177 return [TYPE_TEXT, c]
180 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Dispatch on the character after '<': markup declarations, end tags, bogus
# comments, or a new open tag (lowercasing uppercase tag names). Anything
# else re-emits '<' as text and backs up one character.
# NOTE(review): the "when" clauses for the first three branches (original
# lines 183, 185, 187-188, 190, 197-198) are missing from this listing -
# restore from the full source.
181 tok_state_tag_open = ->
182 switch c = txt.charAt(cur++)
184 tok_state = tok_state_markup_declaration_open
186 tok_state = tok_state_end_tag_open
189 tok_state = tok_state_bogus_comment
191 if lc_alpha.indexOf(c) > -1
192 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
193 tok_state = tok_state_tag_name
194 else if uc_alpha.indexOf(c) > -1
195 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
196 tok_state = tok_state_tag_name
199 tok_state = tok_state_data
200 cur -= 1 # we didn't parse/handle the char after <
201 return [TYPE_TEXT, '<']
204 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Accumulate the tag name into tok_cur_tag[1], lowercasing uppercase input
# and substituting U+FFFD for NUL per the spec.
# NOTE(review): original lines 209, 211, 213-217, 219, and 222-225 (the
# '/', '>' when-clauses, EOF handling, and the default append branch) are
# missing from this listing - restore from the full source.
205 tok_state_tag_name = ->
206 switch c = txt.charAt(cur++)
207 when "\t", "\n", "\u000c", ' '
208 tok_state = tok_state_before_attribute_name
210 tok_state = tok_state_self_closing_start_tag
212 tok_state = tok_state_data
218 tok_cur_tag[1] += "\ufffd"
220 if uc_alpha.indexOf(c) > -1
221 tok_cur_tag[1] += c.toLowerCase()
226 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Skip whitespace, then start a new attribute: new [name, value] pairs are
# unshifted onto tok_cur_tag[2], so the current attribute is always at
# index 0 (the list is in reverse order, per the TYPE_OPEN_TAG comment).
# NOTE(review): original lines 228, 231-232, 234-235, 237-242, 244-246, and
# 249-251 are missing from this listing - restore from the full source.
227 tok_state_before_attribute_name = ->
229 switch c = txt.charAt(cur++)
230 when "\t", "\n", "\u000c", ' '
233 tok_state = tok_state_self_closing_start_tag
236 tok_state = tok_state_data
243 when '"', "'", '<', '='
247 if uc_alpha.indexOf(c) > -1
248 attr_name = c.toLowerCase()
252 tok_cur_tag[2].unshift [attr_name, '']
253 tok_state = tok_state_attribute_name
256 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Accumulate the current attribute's name (tok_cur_tag[2][0][0]),
# lowercasing uppercase input and substituting U+FFFD for NUL.
# NOTE(review): original lines 261, 263, 265, and 267-271 (when-clauses and
# EOF handling) are missing from this listing - restore from the full
# source.
257 tok_state_attribute_name = ->
258 switch c = txt.charAt(cur++)
259 when "\t", "\n", "\u000c", ' '
260 tok_state = tok_state_after_attribute_name
262 tok_state = tok_state_self_closing_start_tag
264 tok_state = tok_state_before_attribute_value
266 tok_state = tok_state_data
272 tok_cur_tag[2][0][0] += "\ufffd"
274 if uc_alpha.indexOf(c) > -1
275 tok_cur_tag[2][0][0] += c.toLowerCase()
277 # Parse error if ", ' or <
278 tok_cur_tag[2][0][0] += c
281 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Decide how the current attribute's value is quoted (double, single, or
# unquoted) and switch to the matching value state.
# NOTE(review): original lines 285-286, 288, 290-291, 293-294, 297-298, and
# 300-303 (when-clauses and parse-error branches) are missing from this
# listing - restore from the full source.
282 tok_state_before_attribute_value = ->
283 switch c = txt.charAt(cur++)
284 when "\t", "\n", "\u000c", ' '
287 tok_state = tok_state_attribute_value_double_quoted
289 tok_state = tok_state_attribute_value_unquoted
292 tok_state = tok_state_attribute_value_single_quoted
295 tok_cur_tag[2][0][1] += "\ufffd"
296 tok_state = tok_state_attribute_value_unquoted
299 tok_state = tok_state_data
304 if uc_alpha.indexOf(c) > -1
305 tok_cur_tag[2][0][1] += c.toLowerCase()
307 # Parse error if ", ` or < (that's a backtick)
308 tok_cur_tag[2][0][1] += c
311 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Accumulate the double-quoted value until the closing '"'; '&' starts an
# in-attribute character reference.
# NOTE(review): original lines 314, 316, 319-320, and 323 (when-clauses and
# EOF handling) are missing from this listing. The transition to the
# unquoted state after appending U+FFFD (line numbered 322) does not match
# the spec for this state - presumably an artifact of the missing lines;
# confirm against the full source.
312 tok_state_attribute_value_double_quoted = ->
313 switch c = txt.charAt(cur++)
315 tok_state = tok_state_after_attribute_value_quoted
317 tok_state = tok_state_character_reference_in_attribute_value
318 tok_char_ref_addl_allowed = '"' # FIXME
321 tok_cur_tag[2][0][1] += "\ufffd"
322 tok_state = tok_state_attribute_value_unquoted
324 tok_cur_tag[2][0][1] += c
327 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After a closing quote: whitespace starts another attribute, '/' a
# self-closing tag, '>' ends the tag; anything else is re-processed in the
# before-attribute-name state (cur is backed up one char).
# NOTE(review): original lines 332, 334, 336-340 (when-clauses, the emitted
# tag token, and EOF handling) are missing from this listing - restore from
# the full source.
328 tok_state_after_attribute_value_quoted = ->
329 switch c = txt.charAt(cur++)
330 when "\t", "\n", "\u000c", ' '
331 tok_state = tok_state_before_attribute_name
333 tok_state = tok_state_self_closing_start_tag
335 tok_state = tok_state_data
341 tok_state = tok_state_before_attribute_name
342 cur -= 1 # we didn't handle that char
346 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
347 # & just got consumed
# Resolve a character reference in data: numeric ("#...;"), named with a
# terminating ';' (delegated to the browser via decode_named_char_ref), or
# one of the semicolon-less legacy_char_refs. On any failure the bare '&'
# is emitted as text and cur is left pointing after it.
# NOTE(review): this function is heavily gapped in this listing (original
# lines 350, 353, 355, 359-366, 368-369, 372, 374-375, 378-379, 381-382,
# 387-388, 391, 393, 398, 401, 404 missing), including the when-clauses,
# the numeric-reference setup (start/i/charset/prefix), and the `max`
# bound used by the legacy loop - restore from the full source before
# changing anything here.
348 tok_state_character_reference_in_data = ->
349 tok_state = tok_state_data
351 return [TYPE_TEXT, '&']
352 switch c = txt.charAt(cur)
354 return [TYPE_TEXT, '&']
356 if cur + 1 >= txt.length
357 return [TYPE_TEXT, '&']
358 if txt.charAt(cur + 1).toLowerCase() is 'x'
367 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
370 return [TYPE_TEXT, '&']
371 if txt.charAt(start + i) is ';'
373 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
376 return [TYPE_TEXT, decoded]
377 return [TYPE_TEXT, '&']
380 if alnum.indexOf(txt.charAt(cur + i)) is -1
383 return [TYPE_TEXT, '&']
384 if txt.charAt(cur + i) is ';'
385 i += 1 # include ';' terminator in value
386 decoded = decode_named_char_ref txt.substr(cur, i)
389 return [TYPE_TEXT, decoded]
390 return [TYPE_TEXT, '&']
392 # no ';' terminator (only legacy char refs)
394 return [TYPE_TEXT, '&']
395 # FIXME: if we're inside an attribute:
396 # 1. don't parse refs that are followed by =
397 # 2. don't parse refs that are followed by alnum
399 for i in [2..max] # no prefix matches, so ok to check shortest first
400 c = legacy_char_refs[txt.substr(cur, i)]
402 cur += i # consume entity chars
403 return [TYPE_TEXT, c]
406 # the functions below implement the Tree Construction algorithm here:
407 # http://www.w3.org/TR/html5/syntax.html#tree-construction
408 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# Append token t at the current insertion point, merging consecutive text
# tokens into one TYPE_TEXT node; an open tag makes its children array the
# new insertion point.
# NOTE(review): the defining line (original 409, presumably
# "tree_append = (t) ->"), the else between the two branches (412), and
# lines 415-420 (presumably converting TYPE_OPEN_TAG to TYPE_TAG and
# handling void/close logic) are missing from this listing - restore from
# the full source.
410 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
411 tree_append_point[tree_append_point.length - 1][1] += t[1]
413 tree_append_point.push t
414 if t[0] is TYPE_OPEN_TAG
421 tree_append_point = t[3]
423 # tree constructor initialization
424 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
425 tree_append_point = tree
426 tree_state = tree_append
428 # tokenizer initialization
429 tok_state = tok_state_data
# Main driver: run the current tokenizer state until input is exhausted.
# NOTE(review): the loop body (original lines 433+, presumably feeding each
# produced token to tree_state) and the return of `tree` are missing from
# this listing - restore from the full source.
432 while cur < txt.length
439 # everything below is tests on the above
# Minimal test harness: apply fn to args (CoffeeScript splat) and log
# pass/fail against the expected string.
# NOTE(review): the `else` line (original 444) between the two console.log
# branches is missing from this listing - restore from the full source.
440 test_equals = (description, fn, args..., expected_output) ->
441 output = fn.apply this, args
442 if output is expected_output
443 console.log "passed: #{description}."
445 console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
# Test helper: parse an HTML snippet and serialize the resulting tree to a
# JSON string for easy comparison in the assertions below.
446 html_to_json = (snippet) ->
447 JSON.stringify parse_html snippet
# Inline regression tests exercising parse_html end to end.
# NOTE(review): the entity-test inputs/expectations below ("a&1234",
# "1€€ ƒ") look mis-encoded in this listing - presumably mojibake from
# extraction; confirm the literals against the full source before editing.
448 test_equals "empty", html_to_json, "", '[]'
449 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
450 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
451 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
452 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
453 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
454 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'