parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an array. The first element in the array is an integer (one of
  26 # the TYPE_* constants below) followed by the appropriate fields for that type
  27 # (shown below in the comments after the TYPE_* definition.)
  28
  29 TYPE_TAG = 0 # name, {attributes}, [children]
  30 TYPE_TEXT = 1 # "text"
  31 TYPE_WHITESPACE = 2
  32 TYPE_COMMENT = 3
  33
  34 alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789"
  35 hex_chars = "0123456789abcdefABCDEF"
  36 digits = "0123456789"
  37
  38 # some SVG elements have dashes in them
  39 tag_name_chars = alnum + "-"
  40
  41 # http://www.w3.org/TR/html5/infrastructure.html#space-character
  42 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  43
  44 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  45 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  46
  47 # These are the character references that don't need a terminating semicolon
  48 # min length: 2, max: 6, none are a prefix of any other.
  49 legacy_char_refs = {
  50         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
  51         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
  52         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
  53         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
  54         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
  55         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
  56         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
  57         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
  58         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
  59         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
  60         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
  61         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
  62         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
  63         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
  64         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
  65         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
  66         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
  67         yen: '¥', yuml: 'ÿ'
  68 }
  69
  70 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
  71 raw_text_elements = ['script', 'style']
  72 escapable_raw_text_elements = ['textarea', 'title']
  73 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
  74 svg_elements = [
  75         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
  76         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
  77         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
  78         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
  79         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
  80         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
  81         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
  82         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
  83         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
  84         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
  85         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
  86         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
  87         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
  88         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
  89         'view', 'vkern'
  90 ]
  91
  92 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
  93 mathml_elements = [
  94         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
  95         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
  96         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
  97         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
  98         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
  99         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 100         'determinant', 'diff', 'divergence', 'divide', 'domain',
 101         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 102         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 103         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 104         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 105         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 106         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 107         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 108         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 109         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 110         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 111         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 112         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 113         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 114         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 115         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 116         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 117         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 118         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 119         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 120         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 121         'vectorproduct', 'xor'
 122 ]
 123 # foreign_elements = [svg_elements..., mathml_elements...]
 124 #normal_elements = All other allowed HTML elements are normal elements.
 125
 126
 127 # decode_named_char_ref()
 128 #
 129 # The list of named character references is _huge_ so ask the browser to decode
 130 # for us instead of wasting bandwidth/space on including the table here.
 131 #
 132 # Pass without the "&" but with the ";" examples:
 133 #    for "&amp" pass "amp;"
 134 #    for "&#x2032" pass "x2032;"
 135 g_dncr = {
 136         cache: {}
 137         textarea: document.createElement('textarea')
 138 }
 139 # TODO test this in IE8
 140 decode_named_char_ref = (txt) ->
 141         txt = "&#{txt}"
 142         decoded = g_dncr.cache[txt]
 143         return decoded if decoded?
 144         g_dncr.textarea.innerHTML = txt
 145         decoded = g_dncr.textarea.value
 146         return null if decoded is txt
 147         return g_dncr.cache[txt] = decoded
 148
 149 parse_html = (txt) ->
 150         cur = 0 # index of next char in txt to be parsed
 151         # declare tree and tokenizer variables so they're in scope below
 152         tree = null
 153         tree_append_point = null
 154         tree_state = null
 155         tok_state = null
 156
 157
 158         # the functions below implement the tokenizer stats described here:
 159         # http://www.w3.org/TR/html5/syntax.html#tokenization
 160
 161         tok_state_data = ->
 162                 if cur >= txt.length
 163                         return null
 164                 switch c = txt.charAt(cur++)
 165                         when '&'
 166                                 tok_state = tok_state_character_reference_in_data
 167                         when '<'
 168                                 tok_state = tok_state_tag_open
 169                         when "\u0000"
 170                                 # Parse error
 171                                 return [TYPE_TEXT, c]
 172                         else
 173                                 return [TYPE_TEXT, c]
 174                 return null
 175
 176         # & just got consumed
 177         tok_state_character_reference_in_data = ->
 178                 tok_state = tok_state_data
 179                 if cur >= txt.length
 180                         return [TYPE_TEXT, '&']
 181                 switch c = txt.charAt(cur)
 182                         when ';'
 183                                 return [TYPE_TEXT, '&']
 184                         when '#'
 185                                 if cur + 1 >= txt.length
 186                                         return [TYPE_TEXT, '&']
 187                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 188                                         prefix = '#x'
 189                                         charset = hex_chars
 190                                         start = cur + 2
 191                                 else
 192                                         charset = digits
 193                                         start = cur + 1
 194                                         prefix = '#'
 195                                 i = 0
 196                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 197                                         i += 1
 198                                 if i is 0
 199                                         return [TYPE_TEXT, '&']
 200                                 if txt.charAt(start + i) is ';'
 201                                         i += 1
 202                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 203                                 if decoded?
 204                                         cur = start + i
 205                                         return [TYPE_TEXT, decoded]
 206                                 return [TYPE_TEXT, '&']
 207                         else
 208                                 for i in [0...31]
 209                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 210                                                 break
 211                                 if i is 0
 212                                         return [TYPE_TEXT, '&']
 213                                 if txt.charAt(cur + i) is ';'
 214                                         i += 1 # include ';' terminator in value
 215                                         decoded = decode_named_char_ref txt.substr(cur, i)
 216                                         if decoded?
 217                                                 cur += i
 218                                                 return [TYPE_TEXT, decoded]
 219                                         return [TYPE_TEXT, '&']
 220                                 else
 221                                         # no ';' terminator (only legacy char refs)
 222                                         if i < 2 or i > 6
 223                                                 return [TYPE_TEXT, '&']
 224                                         # FIXME: if we're inside an attribute:
 225                                         # 1.    don't parse refs that are followed by =
 226                                         # 2.    don't parse refs that are followed by alnum
 227                                         max = i
 228                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 229                                                 c = legacy_char_refs[txt.substr(cur, i)]
 230                                                 if c?
 231                                                         cur += i # consume entity chars
 232                                                         return [TYPE_TEXT, c]
 233                 return null
 234
 235
 236         # the functions below impliment the Tree Contstruction algorithm here:
 237         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 238         tree_append = (t) ->
 239                 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
 240                         tree_append_point[tree_append_point.length - 1][1] += t[1]
 241                 else
 242                         tree_append_point.push t
 243
 244         # tree constructor initialization
 245         tree = [] # see comments on TYPE_TAG/etc for the structure of this data
 246         tree_append_point = tree
 247         tree_state = tree_append
 248
 249         # tokenizer initialization
 250         tok_state = tok_state_data
 251
 252         # proccess input
 253         while cur < txt.length
 254                 t = tok_state()
 255                 if t?
 256                         tree_state t
 257
 258         return tree
 259
 260 # everything below is tests on the above
 261 test_equals = (description, fn, args..., expected_output) ->
 262         output = fn.apply this, args
 263         if output is expected_output
 264                 console.log "passed: #{description}."
 265         else
 266                 console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
 267 html_to_json = (html) ->
 268         return JSON.stringify parse_html html
 269 test_equals "empty", html_to_json, "", '[]'
 270 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 271 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 272 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 273 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'