# HTML parser meant to run in a browser, in support of WYSIWYG editor
# Copyright 2015 Jason Woofenden
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# This file implements a parser for html snippets, meant to be used by a
# WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
# or <body> tags, nor does it produce the top level "document" node in the dom
# tree, nor nodes for html, head or body.
#
# Instead, the data structure produced by this parser is an array of nodes.
#
# Each node is an array. The first element in the array is an integer (one of
# the TYPE_* constants below) followed by the appropriate fields for that type
# (shown below in the comments after the TYPE_* definition.)
# Node type tags (first element of every node array) and the payload fields
# that follow them:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_WHITESPACE = 2
TYPE_COMMENT = 3
# bugfix: this previously read "...uvwxqz" / "...UVWXQZ", which dropped 'y'/'Y'
# (and repeated 'q'/'Q'), so any tag or entity name containing a 'y' (eg
# "style", "yen") failed to tokenize.
alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
hex_chars = "0123456789abcdefABCDEF"
digits = "0123456789"
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
# Legacy character references (decodable without a terminating semicolon).
# min length: 2, max: 6, none are a prefix of any other.
# bugfix: shy was '' — the soft hyphen (U+00AD) is invisible and had been
# lost; written as an escape so it can't silently vanish again.
legacy_char_refs = {
	Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
	aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
	aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
	Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
	curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
	ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
	euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
	Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
	igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
	lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
	Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
	Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
	Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
	pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
	shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ',
	thorn: 'þ',
	times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
	ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
	yen: '¥', yuml: 'ÿ'
}
# Elements that never have content or an end tag
# (http://www.w3.org/TR/html5/syntax.html#void-elements):
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
	'link', 'meta', 'param', 'source', 'track', 'wbr'
]
# Elements whose contents are raw text (no entities, no nested tags) and the
# escapable variant (entities allowed, no nested tags):
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# All element names defined by SVG, used to recognize foreign content.
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
svg_elements = [
	'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
	'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
	'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
	'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
	'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
	'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
	'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
	'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
	'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
	'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
	'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
	'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
	'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
	'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
	'view', 'vkern'
]
# All element names defined by MathML, used to recognize foreign content.
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# bugfix: 'mi' was listed twice.
mathml_elements = [
	'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
	'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
	'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
	'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
	'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
	'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
	'determinant', 'diff', 'divergence', 'divide', 'domain',
	'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
	'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
	'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
	'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
	'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
	'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
	'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
	'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
	'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
	'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
	'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
	'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
	'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
	'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
	'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
	'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
	'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
	'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
	'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
	'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
	'vectorproduct', 'xor'
]
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
# decode_named_char_ref()
#
# The list of named character references is _huge_ so ask the browser to decode
# for us instead of wasting bandwidth/space on including the table here.
#
# Pass the reference without the leading "&" but with the trailing ";". Examples:
#     for "&amp;" pass "amp;"
#     for "&#x2032;" pass "#x2032;"
# Shared state for decode_named_char_ref: a memo table of already-decoded
# references plus a scratch textarea that lets the browser do the decoding.
g_dncr =
	cache: {}
	textarea: document.createElement 'textarea'
# TODO test this in IE8
# Decode one character reference via the browser.
#
# txt: the reference without the leading "&" but with the trailing ";",
#      eg "amp;" or "#x2032;" (see the comment block above).
# Returns the decoded string, or null if the browser didn't recognize it.
# Results (including the "&") are memoized in g_dncr.cache.
decode_named_char_ref = (txt) ->
	# bugfix: this read txt = "{txt}" (the "&#" of the interpolation had been
	# lost), which replaced every argument with the literal string "{txt}".
	# Prepend "&" so the browser sees a complete character reference.
	txt = "&#{txt}"
	decoded = g_dncr.cache[txt]
	return decoded if decoded?
	g_dncr.textarea.innerHTML = txt
	decoded = g_dncr.textarea.value
	# unchanged text means the reference was not recognized
	return null if decoded is txt
	return g_dncr.cache[txt] = decoded
# parse_html(txt) -> array of nodes
#
# Tokenize and tree-build the html snippet in txt, returning an array of
# nodes in the [TYPE_*, ...] format described at the top of this file.
#
# NOTE(review): tok_state_tag_open is referenced below but not defined in
# this chunk — confirm it exists elsewhere in the file before running.
parse_html = (txt) ->
	cur = 0 # index of next char in txt to be parsed
	# declare tree and tokenizer variables so they're in scope below
	tree = null
	tree_append_point = null
	tree_state = null
	tok_state = null
	# the functions below implement the tokenizer states described here:
	# http://www.w3.org/TR/html5/syntax.html#tokenization
	# Each state function consumes zero or more chars (via cur) and returns
	# either a [TYPE_*, ...] token or null ("no token yet; call the possibly
	# updated tok_state again").
	tok_state_data = ->
		if cur >= txt.length
			return null
		switch c = txt.charAt(cur++)
			when '&'
				tok_state = tok_state_character_reference_in_data
			when '<'
				tok_state = tok_state_tag_open
			when "\u0000"
				# Parse error
				return [TYPE_TEXT, c]
			else
				return [TYPE_TEXT, c]
		return null
	# & just got consumed
	# Handles numeric (&#160; / &#xa0;) and named (&amp; plus the legacy
	# semicolon-less forms) references; on any failure the "&" is emitted as
	# literal text and cur is left pointing just past the "&".
	tok_state_character_reference_in_data = ->
		tok_state = tok_state_data
		if cur >= txt.length
			return [TYPE_TEXT, '&']
		switch c = txt.charAt(cur)
			when ';'
				return [TYPE_TEXT, '&']
			when '#'
				# numeric character reference: scan the digit run, then hand
				# "#..." / "#x..." to decode_named_char_ref (the browser also
				# applies the windows-1252 overrides for us)
				if cur + 1 >= txt.length
					return [TYPE_TEXT, '&']
				if txt.charAt(cur + 1).toLowerCase() is 'x'
					prefix = '#x'
					charset = hex_chars
					start = cur + 2
				else
					charset = digits
					start = cur + 1
					prefix = '#'
				i = 0
				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
					i += 1
				if i is 0
					return [TYPE_TEXT, '&']
				if txt.charAt(start + i) is ';'
					i += 1
				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
				if decoded?
					cur = start + i
					return [TYPE_TEXT, decoded]
				return [TYPE_TEXT, '&']
			else
				# named character reference: take the run of alnum chars
				# (capped at 31 — longer than any real named ref)
				for i in [0...31]
					if alnum.indexOf(txt.charAt(cur + i)) is -1
						break
				if i is 0
					return [TYPE_TEXT, '&']
				if txt.charAt(cur + i) is ';'
					i += 1 # include ';' terminator in value
					decoded = decode_named_char_ref txt.substr(cur, i)
					if decoded?
						cur += i
						return [TYPE_TEXT, decoded]
					return [TYPE_TEXT, '&']
				else
					# no ';' terminator (only legacy char refs)
					if i < 2 or i > 6
						return [TYPE_TEXT, '&']
					# FIXME: if we're inside an attribute:
					# 1. don't parse refs that are followed by =
					# 2. don't parse refs that are followed by alnum
					max = i
					for i in [2..max] # no prefix matches, so ok to check shortest first
						c = legacy_char_refs[txt.substr(cur, i)]
						if c?
							cur += i # consume entity chars
							return [TYPE_TEXT, c]
		return null
	# the functions below implement the Tree Construction algorithm here:
	# http://www.w3.org/TR/html5/syntax.html#tree-construction
	# Append token t at the current insertion point, merging consecutive
	# text tokens into a single text node.
	tree_append = (t) ->
		if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
			tree_append_point[tree_append_point.length - 1][1] += t[1]
		else
			tree_append_point.push t
	# tree constructor initialization
	tree = [] # see comments on TYPE_TAG/etc for the structure of this data
	tree_append_point = tree
	tree_state = tree_append
	# tokenizer initialization
	tok_state = tok_state_data
	# process input
	while cur < txt.length
		t = tok_state()
		if t?
			tree_state t
	return tree
# everything below is tests on the above
# Run fn with args and compare (via `is`) against expected_output, logging a
# one-line pass/FAIL report to the console. The last argument is always the
# expected value; everything between fn and it is splatted into args.
test_equals = (description, fn, args..., expected_output) ->
	output = fn.apply this, args
	if output isnt expected_output
		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
	else
		console.log "passed: #{description}."
# Test helper: parse an html snippet and serialize the node array to JSON so
# it can be compared as a plain string.
html_to_json = (html) -> JSON.stringify(parse_html(html))
test_equals "empty", html_to_json, "", '[]'
test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1 ", '[[1,"1€€ ƒ"]]'