From: Jason Woofenden Date: Thu, 17 Dec 2015 03:16:08 +0000 (-0500) Subject: implemented adoption agency algorithm, tested a littl X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=2fe7c46d8d2af3427d10fff4ac63d44d5314dedd;p=peach-html5-editor.git implemented adoption agency algorithm, tested a littl --- diff --git a/parse-html.coffee b/parse-html.coffee index b7421e5..5b1b175 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -31,6 +31,13 @@ TYPE_DOCTYPE = 3 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] TYPE_END_TAG = 5 # name TYPE_EOF = 6 +TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements +TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm + +# namespace constants +NS_HTML = 1 +NS_MATHML = 2 +NS_SVG = 3 class Node constructor: (type, args = {}) -> @@ -40,6 +47,13 @@ class Node @attrs = args.attrs ? {} @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only @children = args.children ? [] + @namespace = args.namespace ? NS_HTML + @parent = args.parent ? 
null + shallow_clone: -> # return a new node that's the same except without the children or parent + # WARNING this doesn't work right on open tags that are still being parsed + attrs = {} + attrs[k] = v for k, v of @attrs + return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace serialize: -> # for unit tests ret = '' switch @type @@ -66,9 +80,9 @@ class Node # FIXME else ret += 'unknown:' + console.log "unknown: #{JSON.stringify @}" # backtrace is just as well return ret - # helpers: (only take args that are normally known when parser creates nodes) new_open_tag = (name) -> return new Node TYPE_OPEN_TAG, name: name @@ -80,6 +94,8 @@ new_comment_node = (txt) -> return new Node TYPE_COMMENT, text: txt new_eof_token = -> return new Node TYPE_EOF +new_aaa_bookmark = -> + return new Node TYPE_AAA_BOOKMARK lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" @@ -176,29 +192,32 @@ mathml_elements = [ #normal_elements = All other allowed HTML elements are normal elements. 
special_elements = { - # from HTML: - address: true, applet: true, area: true, article: true, aside: true, - base: true, basefont: true, bgsound: true, blockquote: true, body: true, - br: true, button: true, caption: true, center: true, col: true, - colgroup: true, dd: true, details: true, dir: true, div: true, dl: true, - dt: true, embed: true, fieldset: true, figcaption: true, figure: true, - footer: true, form: true, frame: true, frameset: true, h1: true, h2: true, - h3: true, h4: true, h5: true, h6: true, head: true, header: true, - hgroup: true, hr: true, html: true, iframe: true, img: true, input: true, - isindex: true, li: true, link: true, listing: true, main: true, - marquee: true, meta: true, nav: true, noembed: true, noframes: true, - noscript: true, object: true, ol: true, p: true, param: true, - plaintext: true, pre: true, script: true, section: true, select: true, - source: true, style: true, summary: true, table: true, tbody: true, - td: true, template: true, textarea: true, tfoot: true, th: true, - thead: true, title: true, tr: true, track: true, ul: true, wbr: true, - xmp: true, - - # from MathML: - mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true, - - # from SVG: - foreignObject: true, desc: true, title: true + # HTML: + address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML, + aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML, + blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML, + caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML, + details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML, + embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML, + footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML, + h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML, + header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML, + img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, 
+ listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML, + noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, + ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML, + script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML, + style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML, + template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, + thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML, + wbr:NS_HTML, xmp:NS_HTML, + + # MathML: + mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML, + 'annotation-xml':NS_MATHML, + + # SVG: + foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG } formatting_elements = { @@ -207,6 +226,8 @@ formatting_elements = { u: true } +el_is_special = (e) -> + return special_elements[e] is e.namespace # decode_named_char_ref() # @@ -234,12 +255,13 @@ parse_html = (txt, parse_error_cb = null) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null - open_tags = [] # stack of open elements + open_els = [] # stack of open elements tree_state = null tok_state = null tok_cur_tag = null # partially parsed tag flag_frameset_ok = null flag_parsing = null + afe = [] # active formatting elements parse_error = -> if parse_error_cb? @@ -253,19 +275,19 @@ parse_html = (txt, parse_error_cb = null) -> # But first... 
the helpers template_tag_is_open = -> - for t in open_tags + for t in open_els if t.type is TYPE_TAG and t.name is 'template' return true return false is_in_scope_x = (tag_name, scope) -> - for t in open_tags + for t in open_els if t.name is tag_name return true if t.name of scope return false return false is_in_scope_x_y = (tag_name, scope, scope2) -> - for t in open_tags + for t in open_els if t.name is tag_name return true if t.name of scope @@ -289,56 +311,277 @@ parse_html = (txt, parse_error_cb = null) -> is_in_table_scope = (tag_name) -> return is_in_scope_x tag_name, table_scopers is_in_select_scope = (tag_name) -> - for t in open_tags + for t in open_els if t.name is tag_name return true if t.name isnt 'optgroup' and t.name isnt 'option' return false return false + # this checks for a particular element, not by name + el_is_in_scope = (el) -> + for t in open_els + if t is el + return true + if t.name of standard_scopers + return false + return false + # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements + # this implementation is structured (mostly) as described at the link above. + # capitalized comments are the "labels" described at the link above. 
reconstruct_active_formatting_elements = -> - # FIXME implement this + return if afe.length is 0 + if afe[0].type is TYPE_MARKER or afe[0] in open_els + return + # Rewind + i = 0 + loop + if i is afe.length - 1 + break + i += 1 + if afe[i].type is TYPE_MARKER or afe[i] in open_els + i -= 1 # Advance + break + # Create + loop + el = afe[i].shallow_clone() + tree_insert_tag el + afe[i] = el + break if i is 0 + i -= 1 + + # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm + # adoption agency algorithm + adoption_agency = (subject) -> + if open_els[0].name is subject + el = open_els[0] + open_els.shift() + # remove it from the list of active formatting elements (if found) + for t, i in afe + if t is el + afe.splice i, 1 + break + return + outer = 0 + loop + if outer >= 8 + return + outer += 1 + fe = null + for t, fe_index in afe + if t.type is TYPE_MARKER + break + if t.name is subject + fe = t + break + if fe is null + in_body_any_other_end_tag subject + return + in_open_els = false + for t in open_els + if t is fe + in_open_els = true + break + unless in_open_els + parse_error() + # "remove it from the list" must mean afe, since it's not in open_els + afe.splice fe_index, 1 + return + unless el_is_in_scope fe + parse_error() + return + unless open_els[0] is fe + parse_error() + # continue + fb = null + fb_index + for t, i in open_els + if t is fe + break + if el_is_special t + fb = t + fb_index = i + if fb is null + loop + t = open_els.shift() + if t is fe + afe.splice fe_index, 1 + return + ca = open_els[fe_index + 1] # common ancestor + node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore + # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list. 
+ bookmark = new_aaa_bookmark() + for t, i in afe + if t is fe + afe.splice i, 0, bookmark + node = last_node = fb + inner = 0 + loop + inner += 1 + node_next = null + for t, i in open_els + if t is node + node_next = open_els[i + 1] + break + node = node_next ? node_above + # TODO make sure node_above gets re-set if/when node is removed from open_els + if node is fe + break + node_in_afe = false + for t, i of afe + if t is node + if inner > 3 + afe.splice i, 1 + else + node_in_afe = true + break + unless node_in_afe + for t, i in open_els + if t is node + node_above = open_els[i + 1] + open_els.splice i, 1 + break + continue + # 7. Create an element for the token for which the element node + # was created, in the HTML namespace, with common ancestor as + # the intended parent; replace the entry for node in the list + # of active formatting elements with an entry for the new + # element, replace the entry for node in the stack of open + # elements with an entry for the new element, and let node be + # the new element. + new_node = node.shallow_clone() + for t, i in afe + if t is node + afe[i] = new_node + break + for t, i in open_els + if t is node + open_els[i] = new_node + break + node = new_node + # 8. If last node is furthest block, then move the + # aforementioned bookmark to be immediately after the new node + # in the list of active formatting elements. + if last_node is fb + for t, i in afe + if t is bookmark + afe.splice i, 1 + for t, i in afe + if t is node + # TODO test: position i gets you "after"? + afe.splice i, 0, new_aaa_bookmark() + # 9. Insert last node into node, first removing it from its + # previous parent node if any. + if last_node.parent? + for c, i of last_node.parent.children + if c is last_node + last_node.parent.children.splice i, 1 + node.children.push last_node + last_node.parent = node + # 10. Let last node be node. + last_node = node + # 11. Return to the step labeled inner loop. + # 14. 
Insert whatever last node ended up being in the previous step + # at the appropriate place for inserting a node, but using common + # ancestor as the override target. + tree_insert_tag last_node, ca + # 15. Create an element for the token for which formatting element + # was created, in the HTML namespace, with furthest block as the + # intended parent. + new_element = fe.shallow_clone() + # 16. Take all of the child nodes of furthest block and append them + # to the element created in the last step. + while fb.children.length + t = fb.children.shift() + t.parent = new_element + new_element.children.push t + # 17. Append that new element to furthest block. + new_element.parent = fb + fb.children.push new_element + # 18. Remove formatting element from the list of active formatting + # elements, and insert the new element into the list of active + # formatting elements at the position of the aforementioned + # bookmark. + for t, i in afe + if t is fe + afe.splice i, 1 + break + for t, i in afe + if t is bookmark + afe[i] = node + break + # 19. Remove formatting element from the stack of open elements, + # and insert the new element into the stack of open elements + # immediately below the position of furthest block in that stack. + for t, i of open_els + if t is fe + open_els.splice i, 1 + break + for t, i of open_els + if t is fb + open_els.splice i, 0, new_element + break + # 20. Jump back to the step labeled outer loop. # http://www.w3.org/TR/html5/syntax.html#close-a-p-element # FIXME implement this close_p_if_in_button_scope = -> - if open_tags[0].name is 'p' - open_tags.pop() + if open_els[0].name is 'p' + open_els.pop() return #p = find_button_scope 'p' #if p? 
# TODO generate_implied_end_tags except for p tags - # TODO parse_error unless open_tags[0].name is 'p' + # TODO parse_error unless open_els[0].name is 'p' # TODO pop stack until 'p' popped - - # http://www.w3.org/TR/html5/syntax.html#insert-a-character tree_insert_a_character = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong - dest = open_tags[0].children + dest = open_els[0].children if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT dest[dest.length - 1].text += t.text else dest.push t # FIXME read spec, do this right + # FIXME implement the override target thing # note: this assumes it's an open tag - tree_insert_tag = (t) -> + tree_insert_tag = (t, override_target = null) -> t.type = TYPE_TAG # not TYPE_OPEN_TAG # convert attributes into a hash while t.attrs_a.length a = t.attrs_a.pop() t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs - open_tags[0].children.push t - open_tags.unshift t + if t.parent? + for c, i of t.parent.children + if c is t + t.parent.children.splice i, 1 + # FIXME spec says to do something to figure out what parent should be + parent = open_els[0] + open_els.unshift t + parent.children.push t + t.parent = parent # http://www.w3.org/TR/html5/syntax.html#insert-a-comment tree_insert_a_comment = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong - open_tags[0].children.push t + open_els[0].children.push t # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody + in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it + for node, i in open_els + if node.name is name + # FIXME generate implied end tags except those with name==name + parse_error() unless i is 0 + while i > 0 + open_els.shift() + i -= 1 + open_els.shift() + return + if special_elements[node.name]? 
+ parse_error() + return tree_in_body = (t) -> switch t.type when TYPE_TEXT @@ -361,7 +604,7 @@ parse_html = (txt, parse_error_cb = null) -> when 'html' parse_error() return if template_tag_is_open() - root_attrs = open_tags[open_tags.length - 1].children + root_attrs = open_els[open_els.length - 1].children for k, v of t.attrs root_attrs[k] = v unless root_attrs[k]? when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' @@ -378,11 +621,16 @@ parse_html = (txt, parse_error_cb = null) -> tree_insert_tag t when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' close_p_if_in_button_scope() - if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] parse_error() - open_tags.shift() + open_els.shift() tree_insert_tag t # TODO lots more to implement here + when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u' + reconstruct_active_formatting_elements() + tree_insert_tag t + afe.push t + # TODO lots more to implement here else # any other start tag reconstruct_active_formatting_elements() tree_insert_tag t @@ -391,7 +639,7 @@ parse_html = (txt, parse_error_cb = null) -> dd: true, dt: true, li: true, p: true, tbody: true, td: true, tfoot: true, th: true, thead: true, tr: true, body: true, html: true, } - for t in open_tags + for t in open_els unless ok_tags[t.name]? 
parse_error() break @@ -410,19 +658,12 @@ parse_html = (txt, parse_error_cb = null) -> return # TODO implement parse error and move to tree_after_body, reprocess # TODO lots more close tags to implement here + when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u' + adoption_agency t.name + # TODO lots more close tags to implement here else - for node, i in open_tags - if node.name is t.name - # FIXME generate implied end tags except those with name==t.name - parse_error() unless i is 0 - while i > 0 - open_tags.shift() - i -= 1 - open_tags.shift() - return - if special_elements[node.name]? - parse_error() - return + in_body_any_other_end_tag t.name + return # the functions below implement the tokenizer stats described here: @@ -768,10 +1009,11 @@ parse_html = (txt, parse_error_cb = null) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data tree = new Node TYPE_TAG, name: 'html' - open_tags = [tree] + open_els = [tree] tree_state = tree_in_body flag_frameset_ok = true flag_parsing = true + afe = [] # active formatting elements # tokenizer initialization tok_state = tok_state_data @@ -872,4 +1114,10 @@ test_parser name: "mis-matched closing tags", \ test_parser name: "mis-matched formatting elements", \ html: "1234567890", expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"', - errors: 2 # FIXME dunno how many there should be + errors: 1 # no idea how many there should be +test_parser name: "crazy formatting elements test", \ + html: "second
first
", + # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]' + # firefox does this: + expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"' + errors: 6 # no idea how many there should be