1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
# NOTE(review): type codes 2 and 3 are defined on lines not visible in this
# listing; the tokenizer-only type jumps to 4 below.
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
# Character-class strings used by the tokenizer's indexOf() membership tests.
# BUGFIX: both alphabets previously ended in "...wxqz" — the letter 'y' was
# missing (and 'q' duplicated), so tag and attribute names containing 'y' or
# 'Y' (e.g. <style>, <body>, <symbol>) were not recognized as name characters
# by the tokenizer states below.
37 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
38 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
40 alnum = lc_alpha + uc_alpha + digits
41 hex_chars = digits + "abcdefABCDEF"
43 # some SVG elements have dashes in them
44 tag_name_chars = alnum + "-"
46 # http://www.w3.org/TR/html5/infrastructure.html#space-character
47 space_chars = "\u0009\u000a\u000c\u000d\u0020"
49 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# Superset of space_chars: adds vertical tab, NEL, NBSP and the Unicode
# space/line/paragraph separator characters.
50 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
52 # These are the character references that don't need a terminating semicolon
53 # min length: 2, max: 6, none are a prefix of any other.
55 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
56 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
57 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
58 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
59 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
60 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
61 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
62 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
63 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
64 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
65 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
66 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
67 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
68 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
69 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
70 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
71 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element categories per the HTML5 syntax spec: void elements never take an
# end tag; raw text elements treat their contents literally; escapable raw
# text elements are literal except for character references.
75 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
76 raw_text_elements = ['script', 'style']
77 escapable_raw_text_elements = ['textarea', 'title']
78 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
80 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
81 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
82 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
83 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
84 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
85 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
86 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
87 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
88 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
89 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
90 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
91 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
92 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
93 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
97 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
99 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
100 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
101 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
102 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
103 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
104 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
105 'determinant', 'diff', 'divergence', 'divide', 'domain',
106 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
107 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
108 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
109 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
110 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
111 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
112 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
# BUGFIX: 'mi' appeared twice on this row; the duplicate is removed (harmless
# for membership tests, but clearly unintended in this sorted list).
113 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
114 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
115 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
116 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
117 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
118 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
119 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
120 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
121 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
122 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
123 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
124 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
125 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
126 'vectorproduct', 'xor'
128 # foreign_elements = [svg_elements..., mathml_elements...]
129 #normal_elements = All other allowed HTML elements are normal elements.
132 # decode_named_char_ref()
134 # The list of named character references is _huge_ so ask the browser to decode
135 # for us instead of wasting bandwidth/space on including the table here.
137 # Pass without the "&" but with the ";" examples:
138 # for "&" pass "amp;"
139 # for "′" pass "x2032;"
142 textarea: document.createElement('textarea')
144 # TODO test this in IE8
# Decode one named character reference (passed per the convention documented
# above, e.g. "amp;") by letting the browser's innerHTML parser perform the
# table lookup. Results are memoized in g_dncr.cache. Returns the decoded
# string, or null when the browser left the text unchanged (i.e. it was not
# a valid named reference).
145 decode_named_char_ref = (txt) ->
147 decoded = g_dncr.cache[txt]
148 return decoded if decoded?
149 g_dncr.textarea.innerHTML = txt
150 decoded = g_dncr.textarea.value
151 return null if decoded is txt
152 return g_dncr.cache[txt] = decoded
# Entry point: tokenize and tree-construct an HTML snippet, returning an
# array of nodes (see the TYPE_* comments at the top of the file).
154 parse_html = (txt) ->
155 cur = 0 # index of next char in txt to be parsed
156 # declare tree and tokenizer variables so they're in scope below
158 tree_append_point = null
161 tok_cur_tag = null # partially parsed tag
# Parse errors are non-fatal: log the position and keep going, matching the
# HTML5 spec's recovery behavior. (The parse_error definition line itself is
# not visible in this listing.)
164 console.log "Parse error at character #{cur} of #{txt.length}"
166 # the functions below implement the tokenizer states described here:
167 # http://www.w3.org/TR/html5/syntax.html#tokenization
169 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state: '&' is handled inline via tokenize_character_reference(), '<'
# switches to the tag-open state, anything else is emitted as a text token.
# NOTE(review): the function's own definition line and some switch branch
# lines are missing from this listing.
171 switch c = txt.charAt(cur++)
173 return [TYPE_TEXT, tokenize_character_reference()]
175 tok_state = tok_state_tag_open
178 return [TYPE_TEXT, c]
182 return [TYPE_TEXT, c]
185 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
186 # not needed: tok_state_character_reference_in_data = ->
187 # just call tokenize_character_reference() directly (see the data state above)
189 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag-open state: we have just consumed '<'. An ASCII letter begins an open
# tag (uppercase is lowercased per spec); otherwise '<' is re-emitted as
# literal text and the offending character is reparsed in the data state.
# NOTE(review): the branch characters for the first three cases are on lines
# missing from this listing.
190 tok_state_tag_open = ->
191 switch c = txt.charAt(cur++)
193 tok_state = tok_state_markup_declaration_open
195 tok_state = tok_state_end_tag_open
198 tok_state = tok_state_bogus_comment
200 if lc_alpha.indexOf(c) > -1
201 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
202 tok_state = tok_state_tag_name
203 else if uc_alpha.indexOf(c) > -1
204 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
205 tok_state = tok_state_tag_name
208 tok_state = tok_state_data
209 cur -= 1 # we didn't parse/handle the char after <
210 return [TYPE_TEXT, '<']
213 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag-name state: whitespace ends the name and moves to attribute parsing;
# NUL is replaced with U+FFFD; uppercase letters are appended lowercased.
# NOTE(review): the catch-all append branch is on a line missing from this
# listing.
214 tok_state_tag_name = ->
215 switch c = txt.charAt(cur++)
216 when "\t", "\n", "\u000c", ' '
217 tok_state = tok_state_before_attribute_name
219 tok_state = tok_state_self_closing_start_tag
221 tok_state = tok_state_data
227 tok_cur_tag[1] += "\ufffd"
230 tok_state = tok_state_data
232 if uc_alpha.indexOf(c) > -1
233 tok_cur_tag[1] += c.toLowerCase()
238 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before-attribute-name state: skip whitespace, handle '/' and '>', then
# start a new attribute. New attributes are unshifted onto tok_cur_tag[2],
# so index 0 is always the attribute currently being parsed.
239 tok_state_before_attribute_name = ->
241 switch c = txt.charAt(cur++)
242 when "\t", "\n", "\u000c", ' '
245 tok_state = tok_state_self_closing_start_tag
248 tok_state = tok_state_data
255 when '"', "'", '<', '='
260 tok_state = tok_state_data
262 if uc_alpha.indexOf(c) > -1
263 attr_name = c.toLowerCase()
267 tok_cur_tag[2].unshift [attr_name, '']
268 tok_state = tok_state_attribute_name
271 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute-name state. tok_cur_tag[2][0][0] is the name of the attribute
# currently being built (attributes are unshifted, so index 0 is the newest).
# BUGFIX: per spec 8.2.4.35 every consuming branch *appends* to the current
# attribute's name, but three branches below used '=' (overwriting the name
# built so far — e.g. an uppercase attribute like STYLE collapsed to its
# last character) while the final catch-all correctly used '+='. All four
# now use '+='.
272 tok_state_attribute_name = ->
273 switch c = txt.charAt(cur++)
274 when "\t", "\n", "\u000c", ' '
275 tok_state = tok_state_after_attribute_name
277 tok_state = tok_state_self_closing_start_tag
279 tok_state = tok_state_before_attribute_value
281 tok_state = tok_state_data
287 tok_cur_tag[2][0][0] += "\ufffd"
290 tok_cur_tag[2][0][0] += c
293 tok_state = tok_state_data
295 if uc_alpha.indexOf(c) > -1
296 tok_cur_tag[2][0][0] += c.toLowerCase()
298 tok_cur_tag[2][0][0] += c
301 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before-attribute-value state: dispatch on the quoting style of the value;
# an unquoted value starts immediately with the current character, and a NUL
# becomes U+FFFD at the start of an unquoted value.
302 tok_state_before_attribute_value = ->
303 switch c = txt.charAt(cur++)
304 when "\t", "\n", "\u000c", ' '
307 tok_state = tok_state_attribute_value_double_quoted
309 tok_state = tok_state_attribute_value_unquoted
312 tok_state = tok_state_attribute_value_single_quoted
315 tok_cur_tag[2][0][1] += "\ufffd"
316 tok_state = tok_state_attribute_value_unquoted
319 tok_state = tok_state_data
325 tok_state = tok_state_data
327 tok_cur_tag[2][0][1] += c
328 tok_state = tok_state_attribute_value_unquoted
331 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Double-quoted attribute value: '"' ends the value, '&' starts a character
# reference (with '"' as the extra allowed character, in-attribute mode),
# NUL becomes U+FFFD, everything else is appended verbatim.
332 tok_state_attribute_value_double_quoted = ->
333 switch c = txt.charAt(cur++)
335 tok_state = tok_state_after_attribute_value_quoted
337 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
340 tok_cur_tag[2][0][1] += "\ufffd"
343 tok_state = tok_state_data
345 tok_cur_tag[2][0][1] += c
348 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Single-quoted attribute value: mirror of the double-quoted state with "'"
# as the terminator / extra allowed character.
349 tok_state_attribute_value_single_quoted = ->
350 switch c = txt.charAt(cur++)
352 tok_state = tok_state_after_attribute_value_quoted
354 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
357 tok_cur_tag[2][0][1] += "\ufffd"
360 tok_state = tok_state_data
362 tok_cur_tag[2][0][1] += c
365 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Unquoted attribute value: whitespace ends the value, '&' starts a
# character reference (with '>' as the extra allowed character), NUL becomes
# U+FFFD; everything else (including the spec's parse-error characters) is
# appended verbatim.
366 tok_state_attribute_value_unquoted = ->
367 switch c = txt.charAt(cur++)
368 when "\t", "\n", "\u000c", ' '
369 tok_state = tok_state_before_attribute_name
371 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
373 tok_state = tok_state_data
378 tok_cur_tag[2][0][1] += "\ufffd"
381 tok_state = tok_state_data
383 # Parse Error if ', <, = or ` (backtick)
384 tok_cur_tag[2][0][1] += c
387 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After a quoted attribute value: whitespace, '/' and '>' proceed normally;
# any other character is a parse error and is reparsed in the
# before-attribute-name state (cur is rewound by one).
388 tok_state_after_attribute_value_quoted = ->
389 switch c = txt.charAt(cur++)
390 when "\t", "\n", "\u000c", ' '
391 tok_state = tok_state_before_attribute_name
393 tok_state = tok_state_self_closing_start_tag
395 tok_state = tok_state_data
401 tok_state = tok_state_data
404 tok_state = tok_state_before_attribute_name
405 cur -= 1 # we didn't handle that char
408 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
409 # Don't set this as a state, just call it
410 # returns a string (NOT a text node)
# Consumes a character reference; the caller has already consumed the '&'.
# Handles numeric ("#..."/"#x...") references, semicolon-terminated named
# references (via decode_named_char_ref), and the legacy references that
# need no semicolon — including the attribute-mode exceptions ('=' or an
# alnum right after a legacy match suppresses decoding when in_attr is set,
# which is what the url-args tests at the bottom of the file exercise).
# NOTE(review): many lines of this routine are missing from this listing;
# the fragments below should not be read as the complete algorithm.
411 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
414 switch c = txt.charAt(cur)
415 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
416 # explicitly not a parse error
419 # there has to be "one or more" alnums between & and ; to be a parse error
422 if cur + 1 >= txt.length
424 if txt.charAt(cur + 1).toLowerCase() is 'x'
433 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
437 if txt.charAt(start + i) is ';'
439 # FIXME This is supposed to generate parse errors for some chars
440 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
447 if alnum.indexOf(txt.charAt(cur + i)) is -1
450 # exit early, because parse_error() below needs at least one alnum
452 if txt.charAt(cur + i) is ';'
453 i += 1 # include ';' terminator in value
454 decoded = decode_named_char_ref txt.substr(cur, i)
461 # no ';' terminator (only legacy char refs)
463 for i in [2..max] # no prefix matches, so ok to check shortest first
464 c = legacy_char_refs[txt.substr(cur, i)]
467 if txt.charAt(cur + i) is '='
468 # "because some legacy user agents will
469 # misinterpret the markup in those cases"
472 if alnum.indexOf(txt.charAt(cur + i)) > -1
473 # this makes attributes forgiving about url args
475 # ok, and besides the weird exceptions for attributes...
476 # return the matching char
477 cur += i # consume entity chars
478 parse_error() # because no terminating ";"
482 return # never reached
484 # the functions below implement the Tree Construction algorithm here:
485 # http://www.w3.org/TR/html5/syntax.html#tree-construction
486 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# Text tokens are merged into a preceding text node when possible; open tags
# push a new TYPE_TAG node and descend into its children array.
# NOTE(review): the surrounding function definitions and the tokenizer main
# loop are on lines missing from this listing.
490 if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
491 tree_append_point[tree_append_point.length - 1][1] += t[1]
493 tree_append_point.push t
496 # convert attributes into a hash
502 tree_append_point.push t
503 tree_append_point = t[3]
504 # TODO implement stack of open elements
505 # TODO implement formatting elements thing
508 # TODO implement close tags
509 # TODO implement self-closing tags
511 console.log "UNIMPLEMENTED tag type: #{t[0]}"
513 # tree constructor initialization
514 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
515 tree_append_point = tree
516 tree_state = tree_append
518 # tokenizer initialization
519 tok_state = tok_state_data
528 return # never reached
530 # everything below is tests on the above
# Minimal test harness: applies fn to args and compares the result with the
# expected string using 'is' (exact equality), logging pass/fail to console.
# NOTE(review): the 'else' line between the pass and fail branches is not
# visible in this listing.
531 test_equals = (description, fn, args..., expected_output) ->
532 output = fn.apply this, args
533 if output is expected_output
534 console.log "passed: #{description}."
536 console.log "FAILED: #{description}..."
537 console.log " Expected: #{expected_output}"
538 console.log " Actual: #{output}"
# Helper: parse then serialize, so expected values can be written as JSON
# strings in the test cases below.
539 html_to_json = (html) ->
540 return JSON.stringify parse_html html
541 test_equals "empty", html_to_json, "", '[]'
542 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
543 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
544 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
545 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
546 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
547 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
548 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
549 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
550 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
551 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'