# HTML parser meant to run in a browser, in support of WYSIWYG editor # Copyright 2015 Jason Woofenden # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the Free # Software Foundation, either version 3 of the License, or (at your option) any # later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more # details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # This file implements a parser for html snippets, meant to be used by a # WYSIWYG editor. Hence it does not attempt to parse doctypes, , # or tags, nor does it produce the top level "document" node in the dom # tree, nor nodes for html, head or body. Comments containing "fixfull" # indicate places where additional code is needed for full HTML document # parsing. # # Instead, the data structure produced by this parser is an array of nodes. # # Each node is an obect of the Node class. Here are the Node types: TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" TYPE_COMMENT = 2 TYPE_DOCTYPE = 3 # the following types are emited by the tokenizer, but shouldn't end up in the tree: TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] TYPE_END_TAG = 5 # name TYPE_EOF = 6 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm # namespace constants NS_HTML = 1 NS_MATHML = 2 NS_SVG = 3 class Node constructor: (type, args = {}) -> @type = type # one of the TYPE_* constants above @name = args.name ? '' # tag name @text = args.text ? '' # contents for text/comment nodes @attrs = args.attrs ? {} @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only @children = args.children ? [] @namespace = args.namespace ? NS_HTML @parent = args.parent ? null shallow_clone: -> # return a new node that's the same except without the children or parent # WARNING this doesn't work right on open tags that are still being parsed attrs = {} attrs[k] = v for k, v of @attrs return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace serialize: -> # for unit tests ret = '' switch @type when TYPE_TAG ret += 'tag:' ret += JSON.stringify @name ret += ',' ret += JSON.stringify @attrs ret += ',[' sep = '' for c in @children ret += sep sep = ',' ret += c.serialize() ret += ']' when TYPE_TEXT ret += 'text:' ret += JSON.stringify @text when TYPE_COMMENT ret += 'comment:' ret += JSON.stringify @text when TYPE_DOCTYPE ret += 'doctype' # FIXME else ret += 'unknown:' console.log "unknown: #{JSON.stringify @}" # backtrace is just as well return ret # helpers: (only take args that are normally known when parser creates nodes) new_open_tag = (name) -> return new Node TYPE_OPEN_TAG, name: name new_end_tag = (name) -> return new Node TYPE_END_TAG, name: name new_text_node = (txt) -> return new Node TYPE_TEXT, text: txt new_comment_node = (txt) -> return new Node TYPE_COMMENT, text: txt new_eof_token = -> return new Node TYPE_EOF new_aaa_bookmark = -> return new Node TYPE_AAA_BOOKMARK lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" # some SVG elements have dashes in them tag_name_chars = alnum + "-" # http://www.w3.org/TR/html5/infrastructure.html#space-character space_chars = "\u0009\u000a\u000c\u000d\u0020" # https://en.wikipedia.org/wiki/Whitespace_character#Unicode whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000" # These are the character references that don't need a terminating semicolon # min length: 2, max: 6, none are a prefix of any other. legacy_char_refs = { Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦', Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©', curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>', Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<', lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø', Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±', pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§', shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ', times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù', ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ' } void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'] raw_text_elements = ['script', 'style'] escapable_raw_text_elements = ['textarea', 'title'] # http://www.w3.org/TR/SVG/ 1.1 (Second Edition) svg_elements = [ 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile', 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite', 'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter', 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src', 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern', 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata', 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline', 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg', 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use', 'view', 'vkern' ] # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition mathml_elements = [ 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos', 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card', 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain', 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot', 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree', 'determinant', 'diff', 'divergence', 'divide', 'domain', 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma', 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor', 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary', 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect', 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit', 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup', 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median', 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min', 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode', 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts', 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline', 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers', 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset', 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece', 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient', 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct', 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff', 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto', 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector', 'vectorproduct', 'xor' ] # foreign_elements = [svg_elements..., mathml_elements...] #normal_elements = All other allowed HTML elements are normal elements. special_elements = { # HTML: address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML, aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML, blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML, caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML, details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML, embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML, footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML, header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML, img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML, # MathML: mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML, 'annotation-xml':NS_MATHML, # SVG: foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG } formatting_elements = { a: true, b: true, big: true, code: true, em: true, font: true, i: true, nobr: true, s: true, small: true, strike: true, strong: true, tt: true, u: true } el_is_special = (e) -> return special_elements[e] is e.namespace # decode_named_char_ref() # # The list of named character references is _huge_ so ask the browser to decode # for us instead of wasting bandwidth/space on including the table here. # # Pass without the "&" but with the ";" examples: # for "&" pass "amp;" # for "′" pass "x2032;" g_dncr = { cache: {} textarea: document.createElement('textarea') } # TODO test this in IE8 decode_named_char_ref = (txt) -> txt = "&#{txt}" decoded = g_dncr.cache[txt] return decoded if decoded? g_dncr.textarea.innerHTML = txt decoded = g_dncr.textarea.value return null if decoded is txt return g_dncr.cache[txt] = decoded parse_html = (txt, parse_error_cb = null) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null open_els = [] # stack of open elements tree_state = null tok_state = null tok_cur_tag = null # partially parsed tag flag_frameset_ok = null flag_parsing = null flag_foster_parenting = null afe = [] # active formatting elements parse_error = -> if parse_error_cb? parse_error_cb cur else console.log "Parse error at character #{cur} of #{txt.length}" # the functions below impliment the Tree Contstruction algorithm # http://www.w3.org/TR/html5/syntax.html#tree-construction # But first... the helpers template_tag_is_open = -> for t in open_els if t.type is TYPE_TAG and t.name is 'template' return true return false is_in_scope_x = (tag_name, scope) -> for t in open_els if t.name is tag_name return true if t.name of scope return false return false is_in_scope_x_y = (tag_name, scope, scope2) -> for t in open_els if t.name is tag_name return true if t.name of scope return false if t.name of scope2 return false return false standard_scopers = { # FIXME these are supposed to be namespace specific 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true, 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true, 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true, 'foreignObject': true, 'desc': true, 'title' } button_scopers = button: true li_scopers = ol: true, ul: true table_scopers = html: true, table: true, template: true is_in_scope = (tag_name) -> return is_in_scope_x tag_name, standard_scopers is_in_button_scope = (tag_name) -> return is_in_scope_x_y tag_name, standard_scopers, button_scopers is_in_table_scope = (tag_name) -> return is_in_scope_x tag_name, table_scopers is_in_select_scope = (tag_name) -> for t in open_els if t.name is tag_name return true if t.name isnt 'optgroup' and t.name isnt 'option' return false return false # this checks for a particular element, not by name el_is_in_scope = (el) -> for t in open_els if t is el return true if t.name of standard_scopers return false return false # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements # this implementation is structured (mostly) as described at the link above. # capitalized comments are the "labels" described at the link above. reconstruct_active_formatting_elements = -> return if afe.length is 0 if afe[0].type is TYPE_MARKER or afe[0] in open_els return # Rewind i = 0 loop if i is afe.length - 1 break i += 1 if afe[i].type is TYPE_MARKER or afe[i] in open_els i -= 1 # Advance break # Create loop el = afe[i].shallow_clone() tree_insert_element el afe[i] = el break if i is 0 i -= 1 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm # adoption agency algorithm adoption_agency = (subject) -> if open_els[0].name is subject el = open_els[0] open_els.shift() # remove it from the list of active formatting elements (if found) for t, i in afe if t is el afe.splice i, 1 break return outer = 0 loop if outer >= 8 return outer += 1 fe = null for t, fe_index in afe if t.type is TYPE_MARKER break if t.name is subject fe = t break if fe is null in_body_any_other_end_tag subject return in_open_els = false for t in open_els if t is fe in_open_els = true break unless in_open_els parse_error() # "remove it from the list" must mean afe, since it's not in open_els afe.splice fe_index, 1 return unless el_is_in_scope fe parse_error() return unless open_els[0] is fe parse_error() # continue fb = null fb_index for t, i in open_els if t is fe break if el_is_special t fb = t fb_index = i if fb is null loop t = open_els.shift() if t is fe afe.splice fe_index, 1 return ca = open_els[fe_index + 1] # common ancestor node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list. bookmark = new_aaa_bookmark() for t, i in afe if t is fe afe.splice i, 0, bookmark node = last_node = fb inner = 0 loop inner += 1 node_next = null for t, i in open_els if t is node node_next = open_els[i + 1] break node = node_next ? node_above # TODO make sure node_above gets re-set if/when node is removed from open_els if node is fe break node_in_afe = false for t, i of afe if t is node if inner > 3 afe.splice i, 1 else node_in_afe = true break unless node_in_afe for t, i in open_els if t is node node_above = open_els[i + 1] open_els.splice i, 1 break continue # 7. reate an element for the token for which the element node # was created, in the HTML namespace, with common ancestor as # the intended parent; replace the entry for node in the list # of active formatting elements with an entry for the new # element, replace the entry for node in the stack of open # elements with an entry for the new element, and let node be # the new element. new_node = node.shallow_clone() for t, i in afe if t is node afe[i] = new_node break for t, i in open_els if t is node open_els[i] = new_node break node = new_node # 8. If last node is furthest block, then move the # aforementioned bookmark to be immediately after the new node # in the list of active formatting elements. if last_node is fb for t, i in afe if t is bookmark afe.splice i, 1 for t, i in afe if t is node # TODO test: position i gets you "after"? afe.splice i, 0, new_aaa_bookmark() # 9. Insert last node into node, first removing it from its # previous parent node if any. if last_node.parent? for c, i of last_node.parent.children if c is last_node last_node.parent.children.splice i, 1 node.children.push last_node last_node.parent = node # 10. Let last node be node. last_node = node # 11. Return to the step labeled inner loop. # 14. Insert whatever last node ended up being in the previous step # at the appropriate place for inserting a node, but using common # ancestor as the override target. tree_insert_element last_node, ca # 15. Create an element for the token for which formatting element # was created, in the HTML namespace, with furthest block as the # intended parent. new_element = fe.shallow_clone() # 16. Take all of the child nodes of furthest block and append them # to the element created in the last step. while fb.children.length t = fb.children.shift() t.parent = new_element new_element.children.push t # 17. Append that new element to furthest block. new_element.parent = fb fb.children.push new_element # 18. Remove formatting element from the list of active formatting # elements, and insert the new element into the list of active # formatting elements at the position of the aforementioned # bookmark. for t, i in afe if t is fe afe.splice i, 1 break for t, i in afe if t is bookmark afe[i] = node break # 19. Remove formatting element from the stack of open elements, # and insert the new element into the stack of open elements # immediately below the position of furthest block in that stack. for t, i of open_els if t is fe open_els.splice i, 1 break for t, i of open_els if t is fb open_els.splice i, 0, new_element break # 20. Jump back to the step labeled outer loop. # http://www.w3.org/TR/html5/syntax.html#close-a-p-element # FIXME implement this close_p_if_in_button_scope = -> if open_els[0].name is 'p' open_els.pop() return #p = find_button_scope 'p' #if p? # TODO generate_implied_end_tags except for p tags # TODO parse_error unless open_els[0].name is 'p' # TODO pop stack until 'p' popped # http://www.w3.org/TR/html5/syntax.html#insert-a-character tree_insert_text = (t) -> dest = adjusted_insertion_location() if dest[1] > 0 prev = dest[0].children[dest[1] - 1] if prev.type is TYPE_TEXT prev.text += t.text return dest[0].children.splice dest[1], 0, t # 8.2.5.1 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node adjusted_insertion_location = (override_target = null) -> # 1. If there was an override target specified, then let target be the # override target. if override_target? target = override_target else # Otherwise, let target be the current node. target = open_els[0] # 2. Determine the adjusted insertion location using the first matching # steps from the following list: # # If foster parenting is enabled and target is a table, tbody, tfoot, # thead, or tr element Foster parenting happens when content is # misnested in tables. if flag_foster_parenting and target.name in foster_parenting_targets console.log "foster parenting isn't implemented yet" # TODO # 1. Let last template be the last template element in the stack of # open elements, if any. # 2. Let last table be the last table element in the stack of open # elements, if any. # 3. If there is a last template and either there is no last table, # or there is one, but last template is lower (more recently added) # than last table in the stack of open elements, then: let adjusted # insertion location be inside last template's template contents, # after its last child (if any), and abort these substeps. # 4. If there is no last table, then let adjusted insertion # location be inside the first element in the stack of open # elements (the html element), after its last child (if any), and # abort these substeps. (fragment case) # 5. If last table has a parent element, then let adjusted # insertion location be inside last table's parent element, # immediately before last table, and abort these substeps. # 6. Let previous element be the element immediately above last # table in the stack of open elements. # 7. Let adjusted insertion location be inside previous element, # after its last child (if any). # Note: These steps are involved in part because it's possible for # elements, the table element in this case in particular, to have # been moved by a script around in the DOM, or indeed removed from # the DOM entirely, after the element was inserted by the parser. else # Otherwise Let adjusted insertion location be inside target, after # its last child (if any). target_i = target.children.length # 3. If the adjusted insertion location is inside a template element, # let it instead be inside the template element's template contents, # after its last child (if any). TODO # 4. Return the adjusted insertion location. return [target, target_i] # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token # aka create_an_element_for_token token_to_element = (t, namespace, intended_parent) -> t.type = TYPE_TAG # not TYPE_OPEN_TAG # convert attributes into a hash attrs = {} while t.attrs_a.length a = t.attrs_a.pop() attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs # TODO 2. If the newly created element has an xmlns attribute in the # XMLNS namespace whose value is not exactly the same as the element's # namespace, that is a parse error. Similarly, if the newly created # element has an xmlns:xlink attribute in the XMLNS namespace whose # value is not the XLink Namespace, that is a parse error. # fixfull: the spec says stuff about form pointers and ownerDocument return el # FIXME read implement "foster parenting" part # FIXME read spec, do this right # FIXME implement the override target thing # note: this assumes it's an open tag # TODO tree_insert_html_element = (t, ... tree_insert_element = (el, override_target = null, namespace = null) -> dest = adjusted_insertion_location override_target if el.type is TYPE_OPEN_TAG # means it's a "token" el = token_to_element el, namespace, dest[0] # fixfull: Document nodes sometimes can't accept more chidren dest[0].children.splice dest[1], 0, el el.parent = dest[0] open_els.unshift el return el # http://www.w3.org/TR/html5/syntax.html#insert-a-comment tree_insert_a_comment = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong open_els[0].children.push t # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it for node, i in open_els if node.name is name # FIXME generate implied end tags except those with name==name parse_error() unless i is 0 while i > 0 open_els.shift() i -= 1 open_els.shift() return if special_elements[node.name]? parse_error() return tree_in_body = (t) -> switch t.type when TYPE_TEXT switch t.text when "\u0000" parse_error() when "\t", "\u000a", "\u000c", "\u000d", ' ' reconstruct_active_formatting_elements() tree_insert_text t else reconstruct_active_formatting_elements() tree_insert_text t flag_frameset_ok = false when TYPE_COMMENT tree_insert_a_comment t when TYPE_DOCTYPE parse_error() when TYPE_OPEN_TAG switch t.name when 'html' parse_error() return if template_tag_is_open() root_attrs = open_els[open_els.length - 1].children for k, v of t.attrs root_attrs[k] = v unless root_attrs[k]? when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' # FIXME also do this for (end tag) return tree_in_head t when 'body' parse_error() # TODO when 'frameset' parse_error() # TODO when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul' close_p_if_in_button_scope() tree_insert_element t when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' close_p_if_in_button_scope() if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] parse_error() open_els.shift() tree_insert_element t # TODO lots more to implement here when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u' reconstruct_active_formatting_elements() el = tree_insert_element t afe.push el # TODO lots more to implement here else # any other start tag reconstruct_active_formatting_elements() tree_insert_element t when TYPE_EOF ok_tags = { dd: true, dt: true, li: true, p: true, tbody: true, td: true, tfoot: true, th: true, thead: true, tr: true, body: true, html: true, } for t in open_els unless ok_tags[t.name]? parse_error() break # TODO stack of template insertion modes thing flag_parsing = false # stop parsing when TYPE_END_TAG switch t.name when 'body' unless is_in_scope 'body' parse_error() return # TODO implement parse error and move to tree_after_body when 'html' unless is_in_scope 'body' # weird, but it's what the spec says parse_error() return # TODO implement parse error and move to tree_after_body, reprocess # TODO lots more close tags to implement here when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u' adoption_agency t.name # TODO lots more close tags to implement here else in_body_any_other_end_tag t.name return # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> switch c = txt.charAt(cur++) when '&' return new_text_node tokenize_character_reference() when '<' tok_state = tok_state_tag_open when "\u0000" parse_error() return new_text_node c when '' # EOF return new_eof_token() else return new_text_node c return null # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state # not needed: tok_state_character_reference_in_data = -> # just call tok_state_character_reference_in_data() # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> switch c = txt.charAt(cur++) when '!' tok_state = tok_state_markup_declaration_open when '/' tok_state = tok_state_end_tag_open when '?' parse_error() tok_state = tok_state_bogus_comment else if lc_alpha.indexOf(c) > -1 tok_cur_tag = new_open_tag c tok_state = tok_state_tag_name else if uc_alpha.indexOf(c) > -1 tok_cur_tag = new_open_tag c.toLowerCase() tok_state = tok_state_tag_name else parse_error() tok_state = tok_state_data cur -= 1 # we didn't parse/handle the char after < return new_text_node '<' return null # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state tok_state_end_tag_open = -> switch c = txt.charAt(cur++) when '>' parse_error() tok_state = tok_state_data when '' # EOF parse_error() tok_state = tok_state_data return new_text_node ' -1 tok_cur_tag = new_end_tag c.toLowerCase() tok_state = tok_state_tag_name else if lc_alpha.indexOf(c) > -1 tok_cur_tag = new_end_tag c tok_state = tok_state_tag_name else parse_error() tok_state = tok_state_bogus_comment return null # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state tok_state_tag_name = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() tok_cur_tag.name += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 tok_cur_tag.name += c.toLowerCase() else tok_cur_tag.name += c return null # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state tok_state_before_attribute_name = -> attr_name = null switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' return null when '/' tok_state = tok_state_self_closing_start_tag return null when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() attr_name = "\ufffd" when '"', "'", '<', '=' parse_error() attr_name = c when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 attr_name = c.toLowerCase() else attr_name = c if attr_name? tok_cur_tag.attrs_a.unshift [attr_name, ''] tok_state = tok_state_attribute_name return null # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state tok_state_attribute_name = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_after_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '=' tok_state = tok_state_before_attribute_value when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() tok_cur_tag.attrs_a[0][0] = "\ufffd" when '"', "'", '<' parse_error() tok_cur_tag.attrs_a[0][0] = c when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 tok_cur_tag.attrs_a[0][0] = c.toLowerCase() else tok_cur_tag.attrs_a[0][0] += c return null # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state tok_state_before_attribute_value = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' return null when '"' tok_state = tok_state_attribute_value_double_quoted when '&' tok_state = tok_state_attribute_value_unquoted cur -= 1 when "'" tok_state = tok_state_attribute_value_single_quoted when "\u0000" # Parse error tok_cur_tag.attrs_a[0][1] += "\ufffd" tok_state = tok_state_attribute_value_unquoted when '>' # Parse error tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag.attrs_a[0][1] += c tok_state = tok_state_attribute_value_unquoted return null # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state tok_state_attribute_value_double_quoted = -> switch c = txt.charAt(cur++) when '"' tok_state = tok_state_after_attribute_value_quoted when '&' tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true when "\u0000" # Parse error tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state tok_state_attribute_value_single_quoted = -> switch c = txt.charAt(cur++) when "'" tok_state = tok_state_after_attribute_value_quoted when '&' tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true when "\u0000" # Parse error tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state tok_state_attribute_value_unquoted = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '&' tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else # Parse Error if ', <, = or ` (backtick) tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state tok_state_after_attribute_value_quoted = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when '' # EOF parse_error() tok_state = tok_state_data else # Parse Error tok_state = tok_state_before_attribute_name cur -= 1 # we didn't handle that char return null # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it # returns a string (NOT a text node) tokenize_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length return '&' switch c = txt.charAt(cur) when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char # explicitly not a parse error return '&' when ';' # there has to be "one or more" alnums between & and ; to be a parse error return '&' when '#' if cur + 1 >= txt.length return '&' if txt.charAt(cur + 1).toLowerCase() is 'x' prefix = '#x' charset = hex_chars start = cur + 2 else charset = digits start = cur + 1 prefix = '#' i = 0 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 i += 1 if i is 0 return '&' if txt.charAt(start + i) is ';' i += 1 # FIXME This is supposed to generate parse errors for some chars decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) if decoded? cur = start + i return decoded return '&' else for i in [0...31] if alnum.indexOf(txt.charAt(cur + i)) is -1 break if i is 0 # exit early, because parse_error() below needs at least one alnum return '&' if txt.charAt(cur + i) is ';' i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) if decoded? cur += i return decoded parse_error() return '&' else # no ';' terminator (only legacy char refs) max = i for i in [2..max] # no prefix matches, so ok to check shortest first c = legacy_char_refs[txt.substr(cur, i)] if c? if in_attr if txt.charAt(cur + i) is '=' # "because some legacy user agents will # misinterpret the markup in those cases" parse_error() return '&' if alnum.indexOf(txt.charAt(cur + i)) > -1 # this makes attributes forgiving about url args return '&' # ok, and besides the weird exceptions for attributes... # return the matching char cur += i # consume entity chars parse_error() # because no terminating ";" return c parse_error() return '&' return # never reached # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data tree = new Node TYPE_TAG, name: 'html' open_els = [tree] tree_state = tree_in_body flag_frameset_ok = true flag_parsing = true flag_foster_parenting = false afe = [] # active formatting elements # tokenizer initialization tok_state = tok_state_data # proccess input while flag_parsing t = tok_state() if t? tree_state t return tree.children # everything below is tests on the above test_equals = (description, output, expected_output) -> if output is expected_output console.log "passed." # don't say name, so smart consoles can merge all of these else console.log "FAILED: \"#{description}\"" console.log " Expected: #{expected_output}" console.log " Actual: #{output}" test_parser = (args) -> parse_errors = [] errors_cb = (i) -> parse_errors.push i parsed = parse_html args.html, errors_cb serialized = '' sep = '' for t in parsed serialized += sep sep = ',' serialized += t.serialize() if serialized isnt args.expected or parse_errors.length isnt args.errors console.log "FAILED: \"#{args.name}\"" else console.log "passed \"#{args.name}\"" if serialized isnt args.expected console.log " Input: #{args.html}" console.log " Correct: #{args.expected}" console.log " Output: #{serialized}" if parse_errors.length isnt args.errors console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}" test_parser name: "empty", \ html: "", expected: '', errors: 0 test_parser name: "just text", \ html: "abc", expected: 'text:"abc"', errors: 0 test_parser name: "named entity", \ html: "a&1234", expected: 'text:"a&1234"', errors: 0 test_parser name: "broken named character references", \ html: "1&2&&3&aabbcc;", expected: 'text:"1&2&&3&aabbcc;"', errors: 2 test_parser name: "numbered entity overrides", \ html: "1€€ ƒ", expected: 'text:"1€€ ƒ"', errors: 0 test_parser name: "open tag", \ html: "foobar", expected: 'text:"foo",tag:"span",{},[text:"bar"]', errors: 1 # no close tag test_parser name: "open tag with attributes", \ html: "foobar", expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]', errors: 1 # no close tag test_parser name: "open tag with attributes of various quotings", \ html: "foobar", expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]', errors: 1 # no close tag test_parser name: "attribute entity exceptions dq", \ html: "foobar", expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "attribute entity exceptions sq", \ html: "foo bar", expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "attribute entity exceptions uq", \ html: "foo bar", expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "matching closing tags", \ html: "foo hi

foo

bar", expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"', errors: 0 test_parser name: "missing closing tag inside", \ html: "foo

barbaz

qux", expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"', errors: 1 # close tag mismatch test_parser name: "mis-matched closing tags", \ html: "12

3456

78", expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]', errors: 2 # misplaced , no at the end test_parser name: "mis-matched formatting elements", \ html: "1234567890", expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"', errors: 1 # no idea how many their should be test_parser name: "crazy formatting elements test", \ html: "~~first~~second", # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]' # firefox does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"', errors: 6 # no idea how many there should be # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat test_parser name: "html5lib aaa 1", \ html: '

', expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]', errors: 2 test_parser name: "html5lib aaa 2", \ html: '1

', expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]', errors: 2 test_parser name: "html5lib aaa 3", \ html: '1', expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]', errors: 2 test_parser name: "html5lib aaa 4", \ html: '123', expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]', errors: 2 test_parser name: "html5lib aaa 5", \ html: '1

', expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"]text:"5"]', errors: 3