1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node-type tags: element 0 of every node array is one of these integers; the
# comment after each constant lists the remaining elements of that node array.
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
# ASCII alphabets used to classify characters while tokenizing.
# BUGFIX: both alphabets previously ended "...wxqz"/"...WXQZ" — the letter
# "y" was missing and "q" appeared twice, so tag names, attribute names and
# character references containing "y"/"Y" were misclassified.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# digits is defined nearby (0-9, as confirmed by hex_chars below)
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
45 # http://www.w3.org/TR/html5/infrastructure.html#space-character
# the five HTML5 "space characters": tab, LF, FF, CR, space
46 space_chars = "\u0009\u000a\u000c\u000d\u0020"
48 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# broader Unicode whitespace set (adds VT, NEL, NBSP, the typographic spaces,
# line/paragraph separators, narrow/medium spaces and ideographic space)
49 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
51 # These are the character references that don't need a terminating semicolon
52 # min length: 2, max: 6, none are a prefix of any other.
# maps reference name (the text right after "&") -> decoded character
54 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
55 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
56 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
57 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
58 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
59 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
60 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
61 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
62 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
63 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
64 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
65 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
66 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
67 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
# NOTE(review): shy should map to U+00AD (soft hyphen), which renders as
# invisible — confirm the character below was not lost in an edit
68 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
69 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
70 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# element-category tables used to decide how tags nest / self-close
74 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
75 raw_text_elements = ['script', 'style']
76 escapable_raw_text_elements = ['textarea', 'title']
77 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
79 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
80 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
81 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
82 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
83 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
84 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
85 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
86 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
87 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
88 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
89 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
90 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
91 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
92 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
96 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
98 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
99 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
100 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
101 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
102 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
103 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
104 'determinant', 'diff', 'divergence', 'divide', 'domain',
105 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
106 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
107 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
108 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
109 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
110 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
111 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
# NOTE(review): 'mi' appears twice on the next line — harmless, likely a typo
112 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
113 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
114 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
115 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
116 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
117 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
118 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
119 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
120 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
121 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
122 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
123 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
124 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
125 'vectorproduct', 'xor'
127 # foreign_elements = [svg_elements..., mathml_elements...]
128 #normal_elements = All other allowed HTML elements are normal elements.
131 # decode_named_char_ref()
133 # The list of named character references is _huge_ so ask the browser to decode
134 # for us instead of wasting bandwidth/space on including the table here.
136 # Pass the reference name without the leading "&" but with the trailing ";". Examples:
137 # for "&" pass "amp;"
138 # for "′" pass "x2032;"
# shared scratch state for decode_named_char_ref: a cache plus a <textarea>
# element whose innerHTML/value round-trip does the actual decoding
141 textarea: document.createElement('textarea')
143 # TODO test this in IE8
# Decode a named character reference by letting the browser do it: assign the
# reference into the scratch textarea via innerHTML and read back .value.
# txt: the reference without the leading "&" but including the ";"
# Returns the decoded string, or null if the browser left it unchanged
# (i.e. it is not a valid reference). Results are memoized in g_dncr.cache.
144 decode_named_char_ref = (txt) ->
146 decoded = g_dncr.cache[txt]
147 return decoded if decoded?
148 g_dncr.textarea.innerHTML = txt
149 decoded = g_dncr.textarea.value
150 return null if decoded is txt
151 return g_dncr.cache[txt] = decoded
# Parse an HTML fragment `txt` into the node-array structure described at the
# top of this file (an array of TYPE_* nodes, as the tests below demonstrate).
# Runs a tokenizer state machine and a simplified tree-construction pass;
# doctypes and html/head/body wrappers are deliberately not handled.
153 parse_html = (txt) ->
154 cur = 0 # index of next char in txt to be parsed
155 # declare tree and tokenizer variables so they're in scope below
157 tree_append_point = null
160 tok_cur_tag = null # partially parsed tag
163 # the functions below implement the tokenizer states described here:
164 # http://www.w3.org/TR/html5/syntax.html#tokenization
166 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# data state: "&" and "<" switch the tokenizer into the character-reference /
# tag-open states (per the spec); any other character is emitted as a text token
168 switch c = txt.charAt(cur++)
170 tok_state = tok_state_character_reference_in_data
172 tok_state = tok_state_tag_open
175 return [TYPE_TEXT, c]
177 return [TYPE_TEXT, c]
180 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
181 # & just got consumed
# Try to parse a character reference starting at `cur`. Handles numeric
# ("#..." / "#x...") references, named references terminated by ";", and the
# semicolonless legacy references in legacy_char_refs. On any failure the
# consumed "&" is emitted back as literal text and we return to the data state.
182 tok_state_character_reference_in_data = ->
183 tok_state = tok_state_data
185 return [TYPE_TEXT, '&']
186 switch c = txt.charAt(cur)
188 return [TYPE_TEXT, '&']
190 if cur + 1 >= txt.length
191 return [TYPE_TEXT, '&']
# "x"/"X" after "#" selects the hexadecimal digit charset
192 if txt.charAt(cur + 1).toLowerCase() is 'x'
# scan as many chars of the chosen charset as possible
201 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
204 return [TYPE_TEXT, '&']
205 if txt.charAt(start + i) is ';'
207 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
210 return [TYPE_TEXT, decoded]
211 return [TYPE_TEXT, '&']
# named reference: scan alphanumerics, then require the ";" terminator
214 if alnum.indexOf(txt.charAt(cur + i)) is -1
217 return [TYPE_TEXT, '&']
218 if txt.charAt(cur + i) is ';'
219 i += 1 # include ';' terminator in value
220 decoded = decode_named_char_ref txt.substr(cur, i)
223 return [TYPE_TEXT, decoded]
224 return [TYPE_TEXT, '&']
226 # no ';' terminator (only legacy char refs)
228 return [TYPE_TEXT, '&']
229 # FIXME: if we're inside an attribute:
230 # 1. don't parse refs that are followed by =
231 # 2. don't parse refs that are followed by alnum
233 for i in [2..max] # no prefix matches, so ok to check shortest first
234 c = legacy_char_refs[txt.substr(cur, i)]
236 cur += i # consume entity chars
237 return [TYPE_TEXT, c]
240 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# "<" was just consumed. A letter starts a new TYPE_OPEN_TAG token (uppercase
# tag names are lower-cased); anything unrecognized backs up one char and
# emits "<" as literal text.
241 tok_state_tag_open = ->
242 switch c = txt.charAt(cur++)
244 tok_state = tok_state_markup_declaration_open
246 tok_state = tok_state_end_tag_open
249 tok_state = tok_state_bogus_comment
251 if lc_alpha.indexOf(c) > -1
# [type, name, attributes (reverse order), children]
252 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
253 tok_state = tok_state_tag_name
254 else if uc_alpha.indexOf(c) > -1
255 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
256 tok_state = tok_state_tag_name
259 tok_state = tok_state_data
260 cur -= 1 # we didn't parse/handle the char after <
261 return [TYPE_TEXT, '<']
264 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Accumulates characters into the tag name (tok_cur_tag[1]): whitespace moves
# on to attribute parsing, uppercase letters are lower-cased, and U+FFFD is
# substituted for invalid input per the spec.
265 tok_state_tag_name = ->
266 switch c = txt.charAt(cur++)
267 when "\t", "\n", "\u000c", ' '
268 tok_state = tok_state_before_attribute_name
270 tok_state = tok_state_self_closing_start_tag
272 tok_state = tok_state_data
278 tok_cur_tag[1] += "\ufffd"
280 if uc_alpha.indexOf(c) > -1
281 tok_cur_tag[1] += c.toLowerCase()
286 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Skips whitespace, then starts a fresh [name, ''] attribute pair. The pair is
# unshifted onto tok_cur_tag[2], so that array holds attributes in reverse
# order (see the TYPE_OPEN_TAG comment) and index 0 is always the current one.
287 tok_state_before_attribute_name = ->
289 switch c = txt.charAt(cur++)
290 when "\t", "\n", "\u000c", ' '
293 tok_state = tok_state_self_closing_start_tag
296 tok_state = tok_state_data
303 when '"', "'", '<', '='
307 if uc_alpha.indexOf(c) > -1
308 attr_name = c.toLowerCase()
312 tok_cur_tag[2].unshift [attr_name, '']
313 tok_state = tok_state_attribute_name
316 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Accumulates the current attribute's name in tok_cur_tag[2][0][0] (the
# most-recently-started attribute); uppercase is lower-cased and U+FFFD is
# substituted for invalid input.
317 tok_state_attribute_name = ->
318 switch c = txt.charAt(cur++)
319 when "\t", "\n", "\u000c", ' '
320 tok_state = tok_state_after_attribute_name
322 tok_state = tok_state_self_closing_start_tag
324 tok_state = tok_state_before_attribute_value
326 tok_state = tok_state_data
332 tok_cur_tag[2][0][0] += "\ufffd"
334 if uc_alpha.indexOf(c) > -1
335 tok_cur_tag[2][0][0] += c.toLowerCase()
337 # Parse error if ", ' or <
338 tok_cur_tag[2][0][0] += c
341 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Decides how the attribute value is quoted and dispatches to the matching
# double-quoted / single-quoted / unquoted value state; the value accumulates
# in tok_cur_tag[2][0][1].
342 tok_state_before_attribute_value = ->
343 switch c = txt.charAt(cur++)
344 when "\t", "\n", "\u000c", ' '
347 tok_state = tok_state_attribute_value_double_quoted
349 tok_state = tok_state_attribute_value_unquoted
352 tok_state = tok_state_attribute_value_single_quoted
355 tok_cur_tag[2][0][1] += "\ufffd"
356 tok_state = tok_state_attribute_value_unquoted
359 tok_state = tok_state_data
# any other character begins an unquoted value
364 tok_cur_tag[2][0][1] += c
365 tok_state = tok_state_attribute_value_unquoted
368 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Accumulates value characters into tok_cur_tag[2][0][1] until the closing
# double quote; character references inside the value get their own state.
369 tok_state_attribute_value_double_quoted = ->
370 switch c = txt.charAt(cur++)
372 tok_state = tok_state_after_attribute_value_quoted
374 tok_state = tok_state_character_reference_in_attribute_value
375 tok_char_ref_addl_allowed = '"' # FIXME
378 tok_cur_tag[2][0][1] += "\ufffd"
380 tok_cur_tag[2][0][1] += c
383 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Same as the double-quoted state but terminated by a single quote.
384 tok_state_attribute_value_single_quoted = ->
385 switch c = txt.charAt(cur++)
387 tok_state = tok_state_after_attribute_value_quoted
389 tok_state = tok_state_character_reference_in_attribute_value
390 tok_char_ref_addl_allowed = "'" # FIXME
393 tok_cur_tag[2][0][1] += "\ufffd"
395 tok_cur_tag[2][0][1] += c
398 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Accumulates an unquoted value into tok_cur_tag[2][0][1]; whitespace ends the
# value and returns to before-attribute-name.
399 tok_state_attribute_value_unquoted = ->
400 switch c = txt.charAt(cur++)
401 when "\t", "\n", "\u000c", ' '
402 tok_state = tok_state_before_attribute_name
404 tok_state = tok_state_character_reference_in_attribute_value
405 tok_char_ref_addl_allowed = '>' # FIXME
407 tok_state = tok_state_data
412 tok_cur_tag[2][0][1] += "\ufffd"
414 # Parse Error if ', <, = or ` (backtick)
415 tok_cur_tag[2][0][1] += c
418 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# A quoted value just ended: whitespace starts another attribute; anything
# unrecognized is pushed back and re-handled in before-attribute-name.
419 tok_state_after_attribute_value_quoted = ->
420 switch c = txt.charAt(cur++)
421 when "\t", "\n", "\u000c", ' '
422 tok_state = tok_state_before_attribute_name
424 tok_state = tok_state_self_closing_start_tag
426 tok_state = tok_state_data
432 tok_state = tok_state_before_attribute_name
433 cur -= 1 # we didn't handle that char
436 # the functions below implement the Tree Construction algorithm here:
437 # http://www.w3.org/TR/html5/syntax.html#tree-construction
438 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# Append token t at the current insertion point: consecutive text tokens are
# merged into one TYPE_TEXT node; other tokens are pushed as-is, and when a
# tag opens, the insertion point moves into its children array (t[3]).
440 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
441 tree_append_point[tree_append_point.length - 1][1] += t[1]
443 tree_append_point.push t
444 if t[0] is TYPE_OPEN_TAG
451 tree_append_point = t[3]
453 # tree constructor initialization
454 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
455 tree_append_point = tree
456 tree_state = tree_append
458 # tokenizer initialization
459 tok_state = tok_state_data
# main loop: run the current tokenizer state until the input is exhausted
462 while cur < txt.length
# everything below is tests on the above

# Run fn(args...) and compare the result with expected_output, logging a
# pass/fail line to the console.
# description: human-readable test name
# fn: function under test; args...: its arguments (CoffeeScript splat)
# expected_output: the exact value fn is expected to return
# BUGFIX: restored the `else` branch — as written, the FAILED line was not
# guarded, so it would have been logged unconditionally.
test_equals = (description, fn, args..., expected_output) ->
	output = fn.apply this, args
	if output is expected_output
		console.log "passed: #{description}."
	else
		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
# Serialize parse_html's tree for `html` as a JSON string (used by the tests).
html_to_json = (html) ->
	JSON.stringify(parse_html(html))
# smoke tests: each line checks parse_html's JSON-serialized output against an
# expected tree (see the TYPE_* constants: 0 = tag node, 1 = text node)
478 test_equals "empty", html_to_json, "", '[]'
479 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
480 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
481 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
482 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
483 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
484 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
485 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'