1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node-type tags: the first element of every node array is one of these.
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
# NOTE(review): constants for types 2-3 are on lines not visible in this chunk — TODO confirm against full source
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
35 TYPE_CLOSE_TAG = 5 # name
# Lowercase/uppercase ASCII alphabets used to classify the first character of
# tag and attribute names throughout the tokenizer.
# BUGFIX: these previously read "...uvwxqz" / "...UVWXQZ" — the letter "y"/"Y"
# was missing (a stray duplicate "q"/"Q" sat in its place), so any tag or
# attribute name containing "y" (e.g. <style>, <body>) could mis-tokenize.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Composite character classes for the tokenizer.
# NOTE(review): `digits` is defined on a line not visible in this chunk (presumably "0123456789") — TODO confirm
41 alnum = lc_alpha + uc_alpha + digits
42 hex_chars = digits + "abcdefABCDEF"
44 # some SVG elements have dashes in them
45 tag_name_chars = alnum + "-"
# space_chars: the five HTML5 "space characters" (tab, LF, FF, CR, space).
# whitespace_chars: the broader Unicode whitespace set (superset of space_chars).
47 # http://www.w3.org/TR/html5/infrastructure.html#space-character
48 space_chars = "\u0009\u000a\u000c\u000d\u0020"
50 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
51 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
53 # These are the character references that don't need a terminating semicolon
54 # min length: 2, max: 6, none are a prefix of any other.
# NOTE(review): the object literal's opening line (e.g. `legacy_char_refs =`) and its
# closing line are not visible in this chunk — TODO confirm against full source.
# NOTE(review): the `shy:` value on original line 70 may contain a soft hyphen
# (U+00AD), which renders invisibly — verify the byte is present in the full source.
56 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
57 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
58 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
59 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
60 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
61 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
62 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
63 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
64 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
65 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
66 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
67 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
68 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
69 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
70 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
71 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
72 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element categories from the HTML5 spec:
# - void elements: never have children, need no close tag
# - raw text elements: contents are not parsed as markup at all
# - escapable raw text elements: contents are text plus character references only
76 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
77 raw_text_elements = ['script', 'style']
78 escapable_raw_text_elements = ['textarea', 'title']
79 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# All element names defined by SVG 1.1 (case-sensitive, hence the mixed case).
# NOTE(review): the list's opening line (e.g. `svg_elements = [`) and closing
# bracket are on lines not visible in this chunk — TODO confirm against full source.
81 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
82 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
83 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
84 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
85 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
86 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
87 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
88 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
89 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
90 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
91 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
92 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
93 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
94 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
98 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# All element names defined by MathML 3.0 (presentation and content markup).
# NOTE(review): the list's opening line (e.g. `mathml_elements = [`) is not
# visible in this chunk — TODO confirm against full source.
# NOTE(review): 'mi' appears twice on original line 114 — likely a harmless
# duplicate; confirm and deduplicate in the full source.
100 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
101 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
102 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
103 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
104 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
105 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
106 'determinant', 'diff', 'divergence', 'divide', 'domain',
107 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
108 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
109 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
110 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
111 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
112 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
113 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
114 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
115 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
116 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
117 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
118 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
119 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
120 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
121 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
122 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
123 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
124 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
125 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
126 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
127 'vectorproduct', 'xor'
129 # foreign_elements = [svg_elements..., mathml_elements...]
130 # normal_elements: all other allowed HTML elements are "normal elements".
133 # decode_named_char_ref()
135 # The list of named character references is _huge_ so ask the browser to decode
136 # for us instead of wasting bandwidth/space on including the table here.
138 # Pass without the "&" but with the ";" examples:
139 # for "&" pass "amp;"
140 # for "′" pass "x2032;"
# NOTE(review): line 143 below is a field of an object (presumably `g_dncr = {...}`)
# whose opening line — likely also declaring a `cache: {}` field read below —
# is not visible in this chunk; TODO confirm against full source.
143 textarea: document.createElement('textarea')
145 # TODO test this in IE8
# Decode a named/numeric character reference via a detached <textarea>,
# memoizing results in g_dncr.cache. Returns null when the browser leaves the
# input unchanged (i.e. it was not a valid reference).
146 decode_named_char_ref = (txt) ->
# NOTE(review): original line 147 is missing here; it may normalize txt
# (e.g. prepend "&" before the innerHTML assignment) — TODO confirm.
148 decoded = g_dncr.cache[txt]
149 return decoded if decoded?
150 g_dncr.textarea.innerHTML = txt
151 decoded = g_dncr.textarea.value
152 return null if decoded is txt
153 return g_dncr.cache[txt] = decoded
# parse_html(txt): tokenize and tree-construct an HTML snippet, producing the
# array-of-nodes structure described at the top of this file.
155 parse_html = (txt) ->
156 cur = 0 # index of next char in txt to be parsed
157 # declare tree and tokenizer variables so they're in scope below
159 open_tags = [] # stack of open elements
162 tok_cur_tag = null # partially parsed tag
# NOTE(review): the parse_error function header (original line ~164) is not
# visible in this chunk; line 165 below is presumably its body.
165 console.log "Parse error at character #{cur} of #{txt.length}"
167 # the functions below implement the tokenizer states described here:
168 # http://www.w3.org/TR/html5/syntax.html#tokenization
170 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# NOTE(review): the tok_state_data function header (original line ~171) and
# several `when` clauses of this switch are not visible in this chunk.
172 switch c = txt.charAt(cur++)
174 return [TYPE_TEXT, tokenize_character_reference()]
176 tok_state = tok_state_tag_open
179 return [TYPE_TEXT, c]
183 return [TYPE_TEXT, c]
190 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Dispatch on the character after "<": markup declaration, end tag, bogus
# comment, or a new open tag (names lowercased per spec).
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
191 tok_state_tag_open = ->
192 switch c = txt.charAt(cur++)
194 tok_state = tok_state_markup_declaration_open
196 tok_state = tok_state_end_tag_open
199 tok_state = tok_state_bogus_comment
201 if lc_alpha.indexOf(c) > -1
202 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
203 tok_state = tok_state_tag_name
204 else if uc_alpha.indexOf(c) > -1
205 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
206 tok_state = tok_state_tag_name
# Anything else: the "<" was literal text; re-process the current char as data.
209 tok_state = tok_state_data
210 cur -= 1 # we didn't parse/handle the char after <
211 return [TYPE_TEXT, '<']
214 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# After "</": start a close-tag token (name lowercased), or fall back to
# emitting literal "</" text / a bogus comment.
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
215 tok_state_end_tag_open = ->
216 switch c = txt.charAt(cur++)
219 tok_state = tok_state_data
222 tok_state = tok_state_data
223 return [TYPE_TEXT, '</']
225 if uc_alpha.indexOf(c) > -1
226 tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
227 tok_state = tok_state_tag_name
228 else if lc_alpha.indexOf(c) > -1
229 tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
230 tok_state = tok_state_tag_name
233 tok_state = tok_state_bogus_comment
236 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Accumulate the tag name into tok_cur_tag[1]; NUL becomes U+FFFD, uppercase
# is lowered. Space/"/">"/EOF transitions are handled by the `when` clauses.
# NOTE(review): several `when` lines and the default append are not visible here.
237 tok_state_tag_name = ->
238 switch c = txt.charAt(cur++)
239 when "\t", "\n", "\u000c", ' '
240 tok_state = tok_state_before_attribute_name
242 tok_state = tok_state_self_closing_start_tag
244 tok_state = tok_state_data
250 tok_cur_tag[1] += "\ufffd"
253 tok_state = tok_state_data
255 if uc_alpha.indexOf(c) > -1
256 tok_cur_tag[1] += c.toLowerCase()
261 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Skip whitespace, then start a new [name, value] attribute pair; new
# attributes are unshifted so tok_cur_tag[2][0] is always the current one.
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
262 tok_state_before_attribute_name = ->
264 switch c = txt.charAt(cur++)
265 when "\t", "\n", "\u000c", ' '
268 tok_state = tok_state_self_closing_start_tag
271 tok_state = tok_state_data
278 when '"', "'", '<', '='
283 tok_state = tok_state_data
285 if uc_alpha.indexOf(c) > -1
286 attr_name = c.toLowerCase()
290 tok_cur_tag[2].unshift [attr_name, '']
291 tok_state = tok_state_attribute_name
294 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Accumulate the current attribute's name in tok_cur_tag[2][0][0].
# NOTE(review): lines 310/313/319 use `=` where the spec's name accumulation
# would append (`+=`, as line 321 does) — possibly a transcription artifact of
# this chunk; confirm against the full source before changing.
295 tok_state_attribute_name = ->
296 switch c = txt.charAt(cur++)
297 when "\t", "\n", "\u000c", ' '
298 tok_state = tok_state_after_attribute_name
300 tok_state = tok_state_self_closing_start_tag
302 tok_state = tok_state_before_attribute_value
304 tok_state = tok_state_data
310 tok_cur_tag[2][0][0] = "\ufffd"
313 tok_cur_tag[2][0][0] = c
316 tok_state = tok_state_data
318 if uc_alpha.indexOf(c) > -1
319 tok_cur_tag[2][0][0] = c.toLowerCase()
321 tok_cur_tag[2][0][0] += c
324 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Decide how the attribute value is quoted (double/single/unquoted) and route
# to the matching value state; NUL becomes U+FFFD in an unquoted value.
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
325 tok_state_before_attribute_value = ->
326 switch c = txt.charAt(cur++)
327 when "\t", "\n", "\u000c", ' '
330 tok_state = tok_state_attribute_value_double_quoted
332 tok_state = tok_state_attribute_value_unquoted
335 tok_state = tok_state_attribute_value_single_quoted
338 tok_cur_tag[2][0][1] += "\ufffd"
339 tok_state = tok_state_attribute_value_unquoted
342 tok_state = tok_state_data
348 tok_state = tok_state_data
350 tok_cur_tag[2][0][1] += c
351 tok_state = tok_state_attribute_value_unquoted
354 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Accumulate value chars until the closing '"'; '&' consumes a character
# reference (with '"' as the extra allowed char, in-attribute mode).
# NOTE(review): the `when` lines of this switch are not visible in this chunk.
355 tok_state_attribute_value_double_quoted = ->
356 switch c = txt.charAt(cur++)
358 tok_state = tok_state_after_attribute_value_quoted
360 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
363 tok_cur_tag[2][0][1] += "\ufffd"
366 tok_state = tok_state_data
368 tok_cur_tag[2][0][1] += c
371 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Mirror of the double-quoted state with "'" as the terminator/allowed char.
# NOTE(review): the `when` lines of this switch are not visible in this chunk.
372 tok_state_attribute_value_single_quoted = ->
373 switch c = txt.charAt(cur++)
375 tok_state = tok_state_after_attribute_value_quoted
377 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
380 tok_cur_tag[2][0][1] += "\ufffd"
383 tok_state = tok_state_data
385 tok_cur_tag[2][0][1] += c
388 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Accumulate an unquoted value until whitespace or '>'; '&' consumes a
# character reference with '>' as the extra allowed char (in-attribute mode).
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
389 tok_state_attribute_value_unquoted = ->
390 switch c = txt.charAt(cur++)
391 when "\t", "\n", "\u000c", ' '
392 tok_state = tok_state_before_attribute_name
394 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
396 tok_state = tok_state_data
401 tok_cur_tag[2][0][1] += "\ufffd"
404 tok_state = tok_state_data
406 # Parse Error if ', <, = or ` (backtick)
407 tok_cur_tag[2][0][1] += c
410 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After a closing quote: whitespace starts the next attribute, '/' and '>'
# close the tag; anything else is re-processed in before-attribute-name.
# NOTE(review): several `when` lines of this switch are not visible in this chunk.
411 tok_state_after_attribute_value_quoted = ->
412 switch c = txt.charAt(cur++)
413 when "\t", "\n", "\u000c", ' '
414 tok_state = tok_state_before_attribute_name
416 tok_state = tok_state_self_closing_start_tag
418 tok_state = tok_state_data
424 tok_state = tok_state_data
427 tok_state = tok_state_before_attribute_name
428 cur -= 1 # we didn't handle that char
431 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
432 # Don't set this as a state, just call it
433 # returns a string (NOT a text node)
# Handles numeric ("&#...;"/"&#x...;") and named references, including the
# legacy no-semicolon names and the attribute-mode exceptions below.
# NOTE(review): many lines of this function (branch headers, returns, variable
# setup such as `charset`/`start`/`prefix`/`max`) are not visible in this chunk.
434 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
437 switch c = txt.charAt(cur)
438 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
439 # explicitly not a parse error
442 # there has to be "one or more" alnums between & and ; to be a parse error
445 if cur + 1 >= txt.length
447 if txt.charAt(cur + 1).toLowerCase() is 'x'
# Numeric reference: scan digits (decimal or hex, per `charset`).
456 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
460 if txt.charAt(start + i) is ';'
462 # FIXME This is supposed to generate parse errors for some chars
463 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# Named reference: scan alnums, try an exact ";"-terminated match first.
470 if alnum.indexOf(txt.charAt(cur + i)) is -1
473 # exit early, because parse_error() below needs at least one alnum
475 if txt.charAt(cur + i) is ';'
476 i += 1 # include ';' terminator in value
477 decoded = decode_named_char_ref txt.substr(cur, i)
484 # no ';' terminator (only legacy char refs)
486 for i in [2..max] # no prefix matches, so ok to check shortest first
487 c = legacy_char_refs[txt.substr(cur, i)]
490 if txt.charAt(cur + i) is '='
491 # "because some legacy user agents will
492 # misinterpret the markup in those cases"
495 if alnum.indexOf(txt.charAt(cur + i)) > -1
496 # this makes attributes forgiving about url args
498 # ok, and besides the weird exceptions for attributes...
499 # return the matching char
500 cur += i # consume entity chars
501 parse_error() # because no terminating ";"
505 return # never reached
507 # the functions below implement the Tree Construction algorithm here:
508 # http://www.w3.org/TR/html5/syntax.html#tree-construction
509 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# tree_append (header not visible in this chunk): append token t to the
# children (index 3) of the innermost open tag, merging adjacent text nodes.
513 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
514 open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
516 open_tags[0][3].push t
519 # convert attributes into a hash
525 # FIXME probs have to auto-close things first
526 open_tags[0][3].push t
528 # TODO implement formatting elements thing
530 # FIXME this is just a hack for now
531 if open_tags.length < 2
534 if open_tags[0][1] isnt t[1]
536 # fall through and close something anyway
540 # TODO implement close tags
541 # TODO implement self-closing tags
543 console.log "UNIMPLEMENTED tag type: #{t[0]}"
546 # tree constructor initialization
547 # see comments on TYPE_TAG/etc for the structure of this data
# Synthetic root node; its children (tree[3]) are what parse_html returns.
548 tree = [TYPE_TAG, 'html', {}, []]
550 tree_state = tree_append
552 # tokenizer initialization
553 tok_state = tok_state_data
# NOTE(review): the main tokenize/construct loop and the return of tree[3]
# (original lines ~554-561) are not visible in this chunk.
562 return # never reached
564 # everything below is tests on the above
# test_equals(description, fn, args..., expected_output): apply fn to args and
# log pass/fail by exact (`is`) comparison against expected_output.
# NOTE(review): the `else` line (original 569) between the pass and fail
# branches is not visible in this chunk.
565 test_equals = (description, fn, args..., expected_output) ->
566 output = fn.apply this, args
567 if output is expected_output
568 console.log "passed: #{description}."
570 console.log "FAILED: #{description}..."
571 console.log "  Expected: #{expected_output}"
572 console.log "  Actual: #{output}"
# Parse an html snippet with parse_html and serialize the resulting node
# array to a JSON string (used by the tests below for easy comparison).
html_to_json = (html) ->
	JSON.stringify(parse_html(html))
# Test cases: each line feeds an html snippet through html_to_json and
# compares against the expected JSON-encoded node array (0 = tag, 1 = text).
575 test_equals "empty", html_to_json, "", '[]'
576 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
577 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
578 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
579 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
580 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
581 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
582 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
583 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
584 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
585 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
586 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'