X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=f0f5e9171da7f25fc089e7e24ca629d34e623727;hb=dc04008f5d598663a2bf763f169cf30594a5c4b5;hp=04733fbc6dcf33b149ed81c61094a78601c87dfc;hpb=cc2911623e5e9a50e19ae8233a990c2e633c218e;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 04733fb..f0f5e91 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -18,22 +18,140 @@ # This file implements a parser for html snippets, meant to be used by a # WYSIWYG editor. Hence it does not attempt to parse doctypes, , # or tags, nor does it produce the top level "document" node in the dom -# tree, nor nodes for html, head or body. +# tree, nor nodes for html, head or body. Comments containing "fixfull" +# indicate places where additional code is needed for full HTML document +# parsing. # -# Instead, the data structure produced by this parser is an array of nodes. +# Instead, the data structure produced by this parser is an array of Nodes. + + +# stacks/lists +# +# the spec uses a many different words do indicate which ends of lists/stacks +# they are talking about (and relative movement within the lists/stacks). This +# section splains. I'm implementing "lists" (afe and open_els) the same way +# (both as stacks) +# +# stacks grow downward (current element is index=0) +# +# example: open_els = [a, b, c, d, e, f, g] # -# Each node is an array. The first element in the array is an integer (one of -# the TYPE_* constants below) followed by the appropriate fields for that type -# (shown below in the comments after the TYPE_* definition.) +# "grows downwards" means it's visualized like this: (index: el, names) +# +# 6: g "start of the list", "topmost" +# 5: f +# 4: e "previous" (to d), "above" +# 3: d (previous/next are relative to this element) +# 2: c "next", "after", "lower", "below" +# 1: b +# 0: a "end of the list", "current node", "bottommost", "last" + + +# Each node is an obect of the Node class. Here are the Node types: TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" -TYPE_WHITESPACE = 2 -TYPE_COMMENT = 3 +TYPE_COMMENT = 2 +TYPE_DOCTYPE = 3 +# the following types are emited by the tokenizer, but shouldn't end up in the tree: +TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] +TYPE_END_TAG = 5 # name +TYPE_EOF = 6 +TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements +TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm + +# namespace constants +NS_HTML = 1 +NS_MATHML = 2 +NS_SVG = 3 -alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789" -hex_chars = "0123456789abcdefABCDEF" +g_debug_log = [] +debug_log_reset = -> + g_debug_log = [] +debug_log = (str) -> + g_debug_log.push str +debug_log_each = (cb) -> + for str in g_debug_log + cb str + +prev_node_id = 0 +class Node + constructor: (type, args = {}) -> + @type = type # one of the TYPE_* constants above + @name = args.name ? '' # tag name + @text = args.text ? '' # contents for text/comment nodes + @attrs = args.attrs ? {} + @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only + @children = args.children ? [] + @namespace = args.namespace ? NS_HTML + @parent = args.parent ? null + if args.id? + @id = "#{args.id}+" + else + @id = "#{++prev_node_id}" + shallow_clone: -> # return a new node that's the same except without the children or parent + # WARNING this doesn't work right on open tags that are still being parsed + attrs = {} + attrs[k] = v for k, v of @attrs + return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id + serialize: (shallow = false, show_ids = false) -> # for unit tests + ret = '' + switch @type + when TYPE_TAG + ret += 'tag:' + ret += JSON.stringify @name + ret += ',' + if show_ids + ret += "##{@id}," + if shallow + break + ret += JSON.stringify @attrs + ret += ',[' + sep = '' + for c in @children + ret += sep + sep = ',' + ret += c.serialize shallow, show_ids + ret += ']' + when TYPE_TEXT + ret += 'text:' + ret += JSON.stringify @text + when TYPE_COMMENT + ret += 'comment:' + ret += JSON.stringify @text + when TYPE_DOCTYPE + ret += 'doctype' + # FIXME + when TYPE_MARKER + ret += 'marker' + when TYPE_AAA_BOOKMARK + ret += 'aaa_bookmark' + else + ret += 'unknown:' + console.log "unknown: #{JSON.stringify @}" # backtrace is just as well + return ret + +# helpers: (only take args that are normally known when parser creates nodes) +new_open_tag = (name) -> + return new Node TYPE_OPEN_TAG, name: name +new_end_tag = (name) -> + return new Node TYPE_END_TAG, name: name +new_element = (name) -> + return new Node TYPE_TAG, name: name +new_text_node = (txt) -> + return new Node TYPE_TEXT, text: txt +new_comment_node = (txt) -> + return new Node TYPE_COMMENT, text: txt +new_eof_token = -> + return new Node TYPE_EOF +new_aaa_bookmark = -> + return new Node TYPE_AAA_BOOKMARK + +lc_alpha = "abcdefghijklmnopqrstuvwxqz" +uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" +alnum = lc_alpha + uc_alpha + digits +hex_chars = digits + "abcdefABCDEF" # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -123,6 +241,59 @@ mathml_elements = [ # foreign_elements = [svg_elements..., mathml_elements...] #normal_elements = All other allowed HTML elements are normal elements. +special_elements = { + # HTML: + address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML, + aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML, + blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML, + caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML, + details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML, + embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML, + footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML, + h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML, + header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML, + img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, + listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML, + noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, + ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML, + script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML, + style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML, + template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, + thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML, + wbr:NS_HTML, xmp:NS_HTML, + + # MathML: + mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML, + 'annotation-xml':NS_MATHML, + + # SVG: + foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG +} + +formatting_elements = { + a: true, b: true, big: true, code: true, em: true, font: true, i: true, + nobr: true, s: true, small: true, strike: true, strong: true, tt: true, + u: true +} + +# all html I presume +end_tag_implied = { + dd: true + dt: true + li: true + option: true + optgroup: true + p: true + rb: true + rp: true + rt: true + rtc: true +} + +el_is_special = (e) -> + return special_elements[e.name]? + # FIXME it should really be: + #return special_elements[e.name] is e.namespace # decode_named_char_ref() # @@ -146,44 +317,967 @@ decode_named_char_ref = (txt) -> return null if decoded is txt return g_dncr.cache[txt] = decoded -parse_html = (txt) -> +parse_html = (txt, parse_error_cb = null) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null - tree_append_point = null + open_els = [] # stack of open elements tree_state = null tok_state = null + tok_cur_tag = null # partially parsed tag + flag_frameset_ok = null + flag_parsing = null + flag_foster_parenting = null + afe = [] # active formatting elements + + parse_error = -> + if parse_error_cb? + parse_error_cb cur + else + console.log "Parse error at character #{cur} of #{txt.length}" + + + # the functions below impliment the Tree Contstruction algorithm + # http://www.w3.org/TR/html5/syntax.html#tree-construction + + # But first... the helpers + template_tag_is_open = -> + for t in open_els + if t.name is 'template' # maybe should also check: and t.namespace is 'html' + return true + return false + is_in_scope_x = (tag_name, scope, namespace) -> + for t in open_els + if t.name is tag_name and (namespace is null or namespace is t.namespace) + return true + if scope[t.name] is t.namespace + return false + return false + is_in_scope_x_y = (tag_name, scope, scope2, namespace) -> + for t in open_els + if t.name is tag_name and (namespace is null or namespace is t.namespace) + return true + if scope[t.name] is t.namespace + return false + if scope2[t.name] is t.namespace + return false + return false + standard_scopers = { # FIXME these are supposed to be namespace specific + applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML, + td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML, + template: NS_HTML, mi: NS_MATHML, + + mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML, + 'annotation-xml': NS_MATHML, + + foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG + } + button_scopers = button: NS_HTML + li_scopers = ol: NS_HTML, ul: NS_HTML + table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML + is_in_scope = (tag_name, namespace = null) -> + return is_in_scope_x tag_name, standard_scopers, namespace + is_in_button_scope = (tag_name, namespace = null) -> + return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace + is_in_table_scope = (tag_name, namespace = null) -> + return is_in_scope_x tag_name, table_scopers, namespace + is_in_select_scope = (tag_name, namespace = null) -> + for t in open_els + if t.name is tag_name and (namespace is null or namespace is t.namespace) + return true + if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option' + return false + return false + # this checks for a particular element, not by name + el_is_in_scope = (el) -> + for t in open_els + if t is el + return true + if standard_scopers[t.name] is t.namespace + return false + return false + + # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements + # this implementation is structured (mostly) as described at the link above. + # capitalized comments are the "labels" described at the link above. + reconstruct_active_formatting_elements = -> + return if afe.length is 0 + if afe[0].type is TYPE_MARKER or afe[0] in open_els + return + # Rewind + i = 0 + loop + if i is afe.length - 1 + break + i += 1 + if afe[i].type is TYPE_MARKER or afe[i] in open_els + i -= 1 # Advance + break + # Create + loop + el = afe[i].shallow_clone() + tree_insert_element el + afe[i] = el + break if i is 0 + i -= 1 + + # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm + # adoption agency algorithm + # overview here: + # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i + # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p + # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements + adoption_agency = (subject) -> + if open_els[0].name is subject + el = open_els[0] + open_els.shift() + # remove it from the list of active formatting elements (if found) + for t, i in afe + if t is el + afe.splice i, 1 + break + return + outer = 0 + loop + if outer >= 8 + return + outer += 1 + # 5. Let formatting element be the last element in the list of + # active formatting elements that: is between the end of the list + # and the last scope marker in the list, if any, or the start of + # the list otherwise, and has the tag name subject. + fe = null + for t, fe_of_afe in afe + if t.type is TYPE_MARKER + break + if t.name is subject + fe = t + break + # If there is no such element, then abort these steps and instead + # act as described in the "any other end tag" entry above. + if fe is null + in_body_any_other_end_tag subject + return + # 6. If formatting element is not in the stack of open elements, + # then this is a parse error; remove the element from the list, and + # abort these steps. + in_open_els = false + for t, fe_of_open_els in open_els + if t is fe + in_open_els = true + break + unless in_open_els + parse_error() + # "remove it from the list" must mean afe, since it's not in open_els + afe.splice fe_of_afe, 1 + return + # 7. If formatting element is in the stack of open elements, but + # the element is not in scope, then this is a parse error; abort + # these steps. + unless el_is_in_scope fe + parse_error() + return + # 8. If formatting element is not the current node, this is a parse + # error. (But do not abort these steps.) + unless open_els[0] is fe + parse_error() + # continue + # 9. Let furthest block be the topmost node in the stack of open + # elements that is lower in the stack than formatting element, and + # is an element in the special category. There might not be one. + fb = null + fb_of_open_els = null + for t, i in open_els + if t is fe + break + if el_is_special t + fb = t + fb_of_open_els = i + # and continue, to see if there's one that's more "topmost" + # 10. If there is no furthest block, then the UA must first pop all + # the nodes from the bottom of the stack of open elements, from the + # current node up to and including formatting element, then remove + # formatting element from the list of active formatting elements, + # and finally abort these steps. + if fb is null + loop + t = open_els.shift() + if t is fe + afe.splice fe_of_afe, 1 + return + # 11. Let common ancestor be the element immediately above + # formatting element in the stack of open elements. + ca = open_els[fe_of_open_els + 1] # common ancestor + + node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore + # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list. + bookmark = new_aaa_bookmark() + for t, i in afe + if t is fe + afe.splice i, 0, bookmark + break + node = last_node = fb + inner = 0 + loop + inner += 1 + # 3. Let node be the element immediately above node in the + # stack of open elements, or if node is no longer in the stack + # of open elements (e.g. because it got removed by this + # algorithm), the element that was immediately above node in + # the stack of open elements before node was removed. + node_next = null + for t, i in open_els + if t is node + node_next = open_els[i + 1] + break + node = node_next ? node_above + debug_log "inner loop #{inner}" + debug_log "open_els: #{serialize_els open_els, true, true}" + debug_log "tree: #{serialize_els tree.children, false, true}" + debug_log "afe: #{serialize_els afe, true, true}" + debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" + debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" + debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" + debug_log "node: #{node.serialize true, true}" + # TODO make sure node_above gets re-set if/when node is removed from open_els + + # 4. If node is formatting element, then go to the next step in + # the overall algorithm. + if node is fe + break + debug_log "the meat" + # 5. If inner loop counter is greater than three and node is in + # the list of active formatting elements, then remove node from + # the list of active formatting elements. + node_in_afe = false + for t, i in afe + if t is node + if inner > 3 + afe.splice i, 1 + debug_log "max out inner" + else + node_in_afe = true + debug_log "in afe" + break + # 6. If node is not in the list of active formatting elements, + # then remove node from the stack of open elements and then go + # back to the step labeled inner loop. + unless node_in_afe + debug_log "not in afe" + for t, i in open_els + if t is node + node_above = open_els[i + 1] + open_els.splice i, 1 + break + continue + debug_log "the bones" + # 7. create an element for the token for which the element node + # was created, in the HTML namespace, with common ancestor as + # the intended parent; replace the entry for node in the list + # of active formatting elements with an entry for the new + # element, replace the entry for node in the stack of open + # elements with an entry for the new element, and let node be + # the new element. + new_node = node.shallow_clone() + for t, i in afe + if t is node + afe[i] = new_node + debug_log "replaced in afe" + break + for t, i in open_els + if t is node + node_above = open_els[i + 1] + open_els[i] = new_node + debug_log "replaced in open_els" + break + node = new_node + # 8. If last node is furthest block, then move the + # aforementioned bookmark to be immediately after the new node + # in the list of active formatting elements. + if last_node is fb + for t, i in afe + if t is bookmark + afe.splice i, 1 + debug_log "removed bookmark" + break + for t, i in afe + if t is node + # "after" means lower + afe.splice i, 0, bookmark # "after as <- + debug_log "placed bookmark after node" + debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}" + break + # 9. Insert last node into node, first removing it from its + # previous parent node if any. + if last_node.parent? + debug_log "last_node has parent" + for c, i in last_node.parent.children + if c is last_node + debug_log "removing last_node from parent" + last_node.parent.children.splice i, 1 + break + node.children.push last_node + last_node.parent = node + # 10. Let last node be node. + last_node = node + debug_log "at last" + # 11. Return to the step labeled inner loop. + # 14. Insert whatever last node ended up being in the previous step + # at the appropriate place for inserting a node, but using common + # ancestor as the override target. + + # JASON: In the case where fe is immediately followed by fb: + # * inner loop exits out early (node==fe) + # * last_node is fb + # * last_node is still in the tree (not a duplicate) + if last_node.parent? + debug_log "FEFIRST? last_node has parent" + for c, i in last_node.parent.children + if c is last_node + debug_log "removing last_node from parent" + last_node.parent.children.splice i, 1 + break + + debug_log "after aaa inner loop" + debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" + debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" + debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" + debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}" + debug_log "tree: #{serialize_els tree.children, false, true}" + + debug_log "insert" + + + # can't use standard insert token thing, because it's already in + # open_els and must stay at it's current position in open_els + dest = adjusted_insertion_location ca + dest[0].children.splice dest[1], 0, last_node + last_node.parent = dest[0] + + + debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}" + debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}" + debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}" + debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}" + debug_log "tree: #{serialize_els tree.children, false, true}" + + # 15. Create an element for the token for which formatting element + # was created, in the HTML namespace, with furthest block as the + # intended parent. + new_element = fe.shallow_clone() # FIXME intended parent thing + # 16. Take all of the child nodes of furthest block and append them + # to the element created in the last step. + while fb.children.length + t = fb.children.shift() + t.parent = new_element + new_element.children.push t + # 17. Append that new element to furthest block. + new_element.parent = fb + fb.children.push new_element + # 18. Remove formatting element from the list of active formatting + # elements, and insert the new element into the list of active + # formatting elements at the position of the aforementioned + # bookmark. + for t, i in afe + if t is fe + afe.splice i, 1 + break + for t, i in afe + if t is bookmark + afe[i] = new_element + break + # 19. Remove formatting element from the stack of open elements, + # and insert the new element into the stack of open elements + # immediately below the position of furthest block in that stack. + for t, i in open_els + if t is fe + open_els.splice i, 1 + break + for t, i in open_els + if t is fb + open_els.splice i, 0, new_element + break + # 20. Jump back to the step labeled outer loop. + debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}" + debug_log "tree: #{serialize_els tree.children, false, true}" + debug_log "open_els: #{serialize_els open_els, true, true}" + debug_log "afe: #{serialize_els afe, true, true}" + debug_log "AAA DONE" + + # http://www.w3.org/TR/html5/syntax.html#close-a-p-element + # FIXME test this (particularly emplied end tags) + close_p_element = -> + generate_implied_end_tags 'p' # arg is exception + if open_els[0].name isnt 'p' + parse_error() + while open_els.length > 1 # just in case + t = open_els.shift() + if t.name is 'p' + return + close_p_if_in_button_scope = -> + if is_in_button_scope 'p' + close_a_p_element() + + # http://www.w3.org/TR/html5/syntax.html#insert-a-character + tree_insert_text = (t) -> + dest = adjusted_insertion_location() + if dest[1] > 0 + prev = dest[0].children[dest[1] - 1] + if prev.type is TYPE_TEXT + prev.text += t.text + return + dest[0].children.splice dest[1], 0, t + + # 8.2.5.1 + # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes + # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node + adjusted_insertion_location = (override_target = null) -> + # 1. If there was an override target specified, then let target be the + # override target. + if override_target? + target = override_target + else # Otherwise, let target be the current node. + target = open_els[0] + # 2. Determine the adjusted insertion location using the first matching + # steps from the following list: + # + # If foster parenting is enabled and target is a table, tbody, tfoot, + # thead, or tr element Foster parenting happens when content is + # misnested in tables. + if flag_foster_parenting and target.name in foster_parenting_targets + console.log "foster parenting isn't implemented yet" # TODO + # 1. Let last template be the last template element in the stack of + # open elements, if any. + # 2. Let last table be the last table element in the stack of open + # elements, if any. + + # 3. If there is a last template and either there is no last table, + # or there is one, but last template is lower (more recently added) + # than last table in the stack of open elements, then: let adjusted + # insertion location be inside last template's template contents, + # after its last child (if any), and abort these substeps. + + # 4. If there is no last table, then let adjusted insertion + # location be inside the first element in the stack of open + # elements (the html element), after its last child (if any), and + # abort these substeps. (fragment case) + + # 5. If last table has a parent element, then let adjusted + # insertion location be inside last table's parent element, + # immediately before last table, and abort these substeps. + + # 6. Let previous element be the element immediately above last + # table in the stack of open elements. + + # 7. Let adjusted insertion location be inside previous element, + # after its last child (if any). + + # Note: These steps are involved in part because it's possible for + # elements, the table element in this case in particular, to have + # been moved by a script around in the DOM, or indeed removed from + # the DOM entirely, after the element was inserted by the parser. + else + # Otherwise Let adjusted insertion location be inside target, after + # its last child (if any). + target_i = target.children.length + + # 3. If the adjusted insertion location is inside a template element, + # let it instead be inside the template element's template contents, + # after its last child (if any). TODO + + # 4. Return the adjusted insertion location. + return [target, target_i] + + # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token + # aka create_an_element_for_token + token_to_element = (t, namespace, intended_parent) -> + t.type = TYPE_TAG # not TYPE_OPEN_TAG + # convert attributes into a hash + attrs = {} + while t.attrs_a.length + a = t.attrs_a.pop() + attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs + el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs + + # TODO 2. If the newly created element has an xmlns attribute in the + # XMLNS namespace whose value is not exactly the same as the element's + # namespace, that is a parse error. Similarly, if the newly created + # element has an xmlns:xlink attribute in the XMLNS namespace whose + # value is not the XLink Namespace, that is a parse error. + + # fixfull: the spec says stuff about form pointers and ownerDocument + + return el + + # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element + insert_foreign_element = (token, namespace) -> + ail = adjusted_insertion_location() + ail_el = ail[0] + ail_i = ail[1] + el = token_to_element token, namespace, ail_el + # TODO skip this next step if it's broken (eg ail_el is document with child already) + el.parent = ail_el + ail_el.children.splice ail_i, 0, el + open_els.unshift el + return el + # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element + insert_html_element = insert_foreign_element # (token, namespace) -> + + # FIXME read implement "foster parenting" part + # FIXME read spec, do this right + # FIXME implement the override target thing + # note: this assumes it's an open tag + # FIXME what part of the spec is this? + # TODO look through all callers of this, and see what they should really be doing. + # eg probably insert_html_element for tokens + tree_insert_element = (el, override_target = null, namespace = null) -> + if namespace? + el.namespace = namespace + dest = adjusted_insertion_location override_target + if el.type is TYPE_OPEN_TAG # means it's a "token" + el = token_to_element el, namespace, dest[0] + unless el.namespace? + namespace = dest.namespace + # fixfull: Document nodes sometimes can't accept more chidren + dest[0].children.splice dest[1], 0, el + el.parent = dest[0] + open_els.unshift el + return el + + # http://www.w3.org/TR/html5/syntax.html#insert-a-comment + tree_insert_a_comment = (t) -> + # FIXME read spec for "adjusted insertion location, etc, this might be wrong + open_els[0].children.push t + + # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags + # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags + generate_implied_end_tags = (except = null) -> + while end_tag_implied[open_els[0]] and open_els[0].name isnt except + open_els.shift() + + # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody + in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it + for node, i in open_els + if node.name is name # FIXME check namespace too + generate_implied_end_tags name # arg is exception + parse_error() unless i is 0 + while i >= 0 + open_els.shift() + i -= 1 + return + if special_elements[node.name]? # FIXME check namespac too + parse_error() + return + tree_in_body = (t) -> + switch t.type + when TYPE_TEXT + switch t.text + when "\u0000" + parse_error() + when "\t", "\u000a", "\u000c", "\u000d", ' ' + reconstruct_active_formatting_elements() + tree_insert_text t + else + reconstruct_active_formatting_elements() + tree_insert_text t + flag_frameset_ok = false + when TYPE_COMMENT + tree_insert_a_comment t + when TYPE_DOCTYPE + parse_error() + when TYPE_OPEN_TAG + switch t.name + when 'html' + parse_error() + return if template_tag_is_open() + root_attrs = open_els[open_els.length - 1].attrs + for k, v of t.attrs + root_attrs[k] = v unless root_attrs[k]? + when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' + # FIXME also do this for (end tag) + return tree_in_head t + when 'body' + parse_error() + # TODO + when 'frameset' + parse_error() + # TODO + when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul' + close_p_if_in_button_scope() + insert_html_element t + when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' + close_p_if_in_button_scope() + if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + parse_error() + open_els.shift() + insert_html_element t + # TODO lots more to implement here + when 'a' + # If the list of active formatting elements + # contains an a element between the end of the list and + # the last marker on the list (or the start of the list + # if there is no marker on the list), then this is a + # parse error; run the adoption agency algorithm for + # the tag name "a", then remove that element from the + # list of active formatting elements and the stack of + # open elements if the adoption agency algorithm didn't + # already remove it (it might not have if the element + # is not in table scope). + found = false + for el in afe + if el.type is TYPE_MARKER + break + if el.name is 'a' + found = el + if found? + parse_error() + adoption_agency 'a' + for el, i in afe + if el is found + afe.splice i, 1 + for el, i in open_els + if el is found + open_els.splice i, 1 + reconstruct_active_formatting_elements() + el = tree_insert_element t + afe.unshift el + when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u' + reconstruct_active_formatting_elements() + el = tree_insert_element t + afe.unshift el + # TODO lots more to implement here + else # any other start tag + reconstruct_active_formatting_elements() + tree_insert_element t + when TYPE_EOF + ok_tags = { + dd: true, dt: true, li: true, p: true, tbody: true, td: true, + tfoot: true, th: true, thead: true, tr: true, body: true, html: true, + } + for t in open_els + unless ok_tags[t.name]? + parse_error() + break + # TODO stack of template insertion modes thing + flag_parsing = false # stop parsing + when TYPE_END_TAG + switch t.name + when 'body' + unless is_in_scope 'body' + parse_error() + return + # TODO implement parse error and move to tree_after_body + when 'html' + unless is_in_scope 'body' # weird, but it's what the spec says + parse_error() + return + # TODO implement parse error and move to tree_after_body, reprocess + when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul' + unless is_in_scope t.name, NS_HTML + parse_error() + return + generate_implied_end_tags() + unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML + parse_error() + loop + el = open_els.shift() + if el.name is t.name and el.namespace is NS_HTML + return + # TODO lots more close tags to implement here + when 'p' + unless is_in_button_scope 'p' + parse_error() + insert_html_element new_open_tag 'p' + close_p_element() + # TODO lots more close tags to implement here + when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u' + adoption_agency t.name + # TODO lots more close tags to implement here + else + in_body_any_other_end_tag t.name + return # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization + # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> - if cur >= txt.length - return null switch c = txt.charAt(cur++) when '&' - tok_state = tok_state_character_reference_in_data + return new_text_node tokenize_character_reference() when '<' tok_state = tok_state_tag_open when "\u0000" + parse_error() + return new_text_node c + when '' # EOF + return new_eof_token() + else + return new_text_node c + return null + + # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state + # not needed: tok_state_character_reference_in_data = -> + # just call tok_state_character_reference_in_data() + + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state + tok_state_tag_open = -> + switch c = txt.charAt(cur++) + when '!' + tok_state = tok_state_markup_declaration_open + when '/' + tok_state = tok_state_end_tag_open + when '?' + parse_error() + tok_state = tok_state_bogus_comment + else + if lc_alpha.indexOf(c) > -1 + tok_cur_tag = new_open_tag c + tok_state = tok_state_tag_name + else if uc_alpha.indexOf(c) > -1 + tok_cur_tag = new_open_tag c.toLowerCase() + tok_state = tok_state_tag_name + else + parse_error() + tok_state = tok_state_data + cur -= 1 # we didn't parse/handle the char after < + return new_text_node '<' + return null + + # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state + tok_state_end_tag_open = -> + switch c = txt.charAt(cur++) + when '>' + parse_error() + tok_state = tok_state_data + when '' # EOF + parse_error() + tok_state = tok_state_data + return new_text_node ' -1 + tok_cur_tag = new_end_tag c.toLowerCase() + tok_state = tok_state_tag_name + else if lc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c + tok_state = tok_state_tag_name + else + parse_error() + tok_state = tok_state_bogus_comment + return null + + # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state + tok_state_tag_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + tok_cur_tag.name += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c.toLowerCase() + else + tok_cur_tag.name += c + return null + + # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state + tok_state_before_attribute_name = -> + attr_name = null + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '/' + tok_state = tok_state_self_closing_start_tag + return null + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + attr_name = "\ufffd" + when '"', "'", '<', '=' + parse_error() + attr_name = c + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + attr_name = c.toLowerCase() + else + attr_name = c + if attr_name? + tok_cur_tag.attrs_a.unshift [attr_name, ''] + tok_state = tok_state_attribute_name + return null + + # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state + tok_state_attribute_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_after_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '=' + tok_state = tok_state_before_attribute_value + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + tok_cur_tag.attrs_a[0][0] = "\ufffd" + when '"', "'", '<' + parse_error() + tok_cur_tag.attrs_a[0][0] = c + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.attrs_a[0][0] = c.toLowerCase() + else + tok_cur_tag.attrs_a[0][0] += c + return null + + # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state + tok_state_before_attribute_value = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '"' + tok_state = tok_state_attribute_value_double_quoted + when '&' + tok_state = tok_state_attribute_value_unquoted + cur -= 1 + when "'" + tok_state = tok_state_attribute_value_single_quoted + when "\u0000" + # Parse error + tok_cur_tag.attrs_a[0][1] += "\ufffd" + tok_state = tok_state_attribute_value_unquoted + when '>' + # Parse error + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag.attrs_a[0][1] += c + tok_state = tok_state_attribute_value_unquoted + return null + + # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state + tok_state_attribute_value_double_quoted = -> + switch c = txt.charAt(cur++) + when '"' + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true + when "\u0000" + # Parse error + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag.attrs_a[0][1] += c + return null + + # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state + tok_state_attribute_value_single_quoted = -> + switch c = txt.charAt(cur++) + when "'" + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true + when "\u0000" # Parse error - return [TYPE_TEXT, c] + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data else - return [TYPE_TEXT, c] + tok_cur_tag.attrs_a[0][1] += c return null - # & just got consumed - tok_state_character_reference_in_data = -> - tok_state = tok_state_data + # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state + tok_state_attribute_value_unquoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error if ', <, = or ` (backtick) + tok_cur_tag.attrs_a[0][1] += c + return null + + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state + tok_state_after_attribute_value_quoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error + tok_state = tok_state_before_attribute_name + cur -= 1 # we didn't handle that char + return null + + # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # Don't set this as a state, just call it + # returns a string (NOT a text node) + tokenize_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length - return [TYPE_TEXT, '&'] + return '&' switch c = txt.charAt(cur) + when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char + # explicitly not a parse error + return '&' when ';' - return [TYPE_TEXT, '&'] + # there has to be "one or more" alnums between & and ; to be a parse error + return '&' when '#' if cur + 1 >= txt.length - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(cur + 1).toLowerCase() is 'x' prefix = '#x' charset = hex_chars @@ -196,78 +1290,203 @@ parse_html = (txt) -> while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 i += 1 if i is 0 - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(start + i) is ';' i += 1 + # FIXME This is supposed to generate parse errors for some chars decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) if decoded? cur = start + i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + return '&' else for i in [0...31] if alnum.indexOf(txt.charAt(cur + i)) is -1 break if i is 0 - return [TYPE_TEXT, '&'] + # exit early, because parse_error() below needs at least one alnum + return '&' if txt.charAt(cur + i) is ';' i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) if decoded? cur += i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + parse_error() + return '&' else # no ';' terminator (only legacy char refs) - if i < 2 or i > 6 - return [TYPE_TEXT, '&'] - # FIXME: if we're inside an attribute: - # 1. don't parse refs that are followed by = - # 2. don't parse refs that are followed by alnum max = i for i in [2..max] # no prefix matches, so ok to check shortest first c = legacy_char_refs[txt.substr(cur, i)] if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... + # return the matching char cur += i # consume entity chars - return [TYPE_TEXT, c] - return null - - - # the functions below impliment the Tree Contstruction algorithm here: - # http://www.w3.org/TR/html5/syntax.html#tree-construction - tree_append = (t) -> - if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT - tree_append_point[tree_append_point.length - 1][1] += t[1] - else - tree_append_point.push t + parse_error() # because no terminating ";" + return c + parse_error() + return '&' + return # never reached # tree constructor initialization - tree = [] # see comments on TYPE_TAG/etc for the structure of this data - tree_append_point = tree - tree_state = tree_append + # see comments on TYPE_TAG/etc for the structure of this data + tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + open_els = [tree] + tree_state = tree_in_body + flag_frameset_ok = true + flag_parsing = true + flag_foster_parenting = false + afe = [] # active formatting elements # tokenizer initialization tok_state = tok_state_data # proccess input - while cur < txt.length + while flag_parsing t = tok_state() if t? tree_state t - - return tree + return tree.children # everything below is tests on the above -test_equals = (description, fn, args..., expected_output) -> - output = fn.apply this, args +test_equals = (description, output, expected_output) -> if output is expected_output - console.log "passed: #{description}." + console.log "passed." # don't say name, so smart consoles can merge all of these else - console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}" -html_to_json = (html) -> - return JSON.stringify parse_html html -test_equals "empty", html_to_json, "", '[]' -test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' -test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' -test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' -test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1â¬â¬ Æ"]]' + console.log "FAILED: \"#{description}\"" + console.log " Expected: #{expected_output}" + console.log " Actual: #{output}" +serialize_els = (els, shallow, show_ids) -> + serialized = '' + sep = '' + for t in els + serialized += sep + sep = ',' + serialized += t.serialize shallow, show_ids + return serialized +test_parser = (args) -> + debug_log_reset() + parse_errors = [] + errors_cb = (i) -> + parse_errors.push i + prev_node_id = 0 # reset counter + parsed = parse_html args.html, errors_cb + serialized = serialize_els parsed, false, false + if serialized isnt args.expected # or parse_errors.length isnt args.errors + debug_log_each (str) -> + console.log str + console.log "FAILED: \"#{args.name}\"" + else + console.log "passed \"#{args.name}\"" + if serialized isnt args.expected + console.log " Input: #{args.html}" + console.log " Correct: #{args.expected}" + console.log " Output: #{serialized}" + if parse_errors.length isnt args.errors + console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}" + +test_parser name: "empty", \ + html: "", + expected: '', + errors: 0 +test_parser name: "just text", \ + html: "abc", + expected: 'text:"abc"', + errors: 0 +test_parser name: "named entity", \ + html: "a&1234", + expected: 'text:"a&1234"', + errors: 0 +test_parser name: "broken named character references", \ + html: "1&2&&3&aabbcc;", + expected: 'text:"1&2&&3&aabbcc;"', + errors: 2 +test_parser name: "numbered entity overrides", \ + html: "1€€ ƒ", + expected: 'text:"1â¬â¬ Æ"', + errors: 0 +test_parser name: "open tag", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "open tag with attributes", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "open tag with attributes of various quotings", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "attribute entity exceptions dq", \ + html: "foobar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "attribute entity exceptions sq", \ + html: "foo bar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "attribute entity exceptions uq", \ + html: "foo bar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "matching closing tags", \ + html: "foo hi

foo

bar", + expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"', + errors: 0 +test_parser name: "missing closing tag inside", \ + html: "foo

barbaz

qux", + expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"', + errors: 1 # close tag mismatch +test_parser name: "mis-matched closing tags", \ + html: "12

3456

78", + expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]', + errors: 2 # misplaced , no at the end +test_parser name: "mis-matched formatting elements", \ + html: "1234567890", + expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"', + errors: 1 # no idea how many their should be +test_parser name: "8.2.8.1 Misnested tags: ", \ + html: '

12345

', + expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]', + errors: 1 +test_parser name: "8.2.8.2 Misnested tags:

", \ + html: '1

', + expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]', + errors: 1 +test_parser name: "crazy formatting elements test", \ + html: "~~first~~second", + # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]' + # firefox does this: + expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"', + errors: 6 # no idea how many there should be +# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat +test_parser name: "html5lib aaa 1", \ + html: '

', + expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]', + errors: 2 +test_parser name: "html5lib aaa 2", \ + html: '1

', + expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]', + errors: 2 +test_parser name: "html5lib aaa 3", \ + html: '1', + expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]', + errors: 2 +test_parser name: "html5lib aaa 4", \ + html: '123', + expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]', + errors: 2 +test_parser name: "html5lib aaa 5", \ + html: '1

', + expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]', + errors: 3