# This file implements a parser for html snippets, meant to be used by a
# WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
# or <body> tags, nor does it produce the top level "document" node in the dom
-# tree, nor nodes for html, head or body.
+# tree, nor nodes for html, head or body. Comments containing "fixfull"
+# indicate places where additional code is needed for full HTML document
+# parsing.
#
# Instead, the data structure produced by this parser is an array of nodes.
#
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an obect of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
+TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+
+# namespace constants
+NS_HTML = 1
+NS_MATHML = 2
+NS_SVG = 3
+
+class Node
+ constructor: (type, args = {}) ->
+ @type = type # one of the TYPE_* constants above
+ @name = args.name ? '' # tag name
+ @text = args.text ? '' # contents for text/comment nodes
+ @attrs = args.attrs ? {}
+ @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
+ @children = args.children ? []
+ @namespace = args.namespace ? NS_HTML
+ @parent = args.parent ? null
+ shallow_clone: -> # return a new node that's the same except without the children or parent
+ # WARNING this doesn't work right on open tags that are still being parsed
+ attrs = {}
+ attrs[k] = v for k, v of @attrs
+ return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
+ serialize: -> # for unit tests
+ ret = ''
+ switch @type
+ when TYPE_TAG
+ ret += 'tag:'
+ ret += JSON.stringify @name
+ ret += ','
+ ret += JSON.stringify @attrs
+ ret += ',['
+ sep = ''
+ for c in @children
+ ret += sep
+ sep = ','
+ ret += c.serialize()
+ ret += ']'
+ when TYPE_TEXT
+ ret += 'text:'
+ ret += JSON.stringify @text
+ when TYPE_COMMENT
+ ret += 'comment:'
+ ret += JSON.stringify @text
+ when TYPE_DOCTYPE
+ ret += 'doctype'
+ # FIXME
+ else
+ ret += 'unknown:'
+ console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
+ return ret
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+ return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+ return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+ return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+ return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+ return new Node TYPE_EOF
+new_aaa_bookmark = ->
+ return new Node TYPE_AAA_BOOKMARK
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
-scopers = { # FIXME these are supposed to be namespace specific
- 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
- 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
- 'foreignObject', 'desc', 'title'
-}
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
#normal_elements = All other allowed HTML elements are normal elements.
special_elements = {
- # from HTML:
- address: true, applet: true, area: true, article: true, aside: true,
- base: true, basefont: true, bgsound: true, blockquote: true, body: true,
- br: true, button: true, caption: true, center: true, col: true,
- colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
- dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
- footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
- h3: true, h4: true, h5: true, h6: true, head: true, header: true,
- hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
- isindex: true, li: true, link: true, listing: true, main: true,
- marquee: true, meta: true, nav: true, noembed: true, noframes: true,
- noscript: true, object: true, ol: true, p: true, param: true,
- plaintext: true, pre: true, script: true, section: true, select: true,
- source: true, style: true, summary: true, table: true, tbody: true,
- td: true, template: true, textarea: true, tfoot: true, th: true,
- thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
- xmp: true,
-
- # from MathML:
- mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
-
- # from SVG:
- foreignObject: true, desc: true, title: true
+ # HTML:
+ address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
+ aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
+ blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
+ caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
+ details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
+ embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
+ footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
+ h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
+ header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
+ img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
+ listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
+ noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
+ ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
+ script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
+ style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
+ template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
+ thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
+ wbr:NS_HTML, xmp:NS_HTML,
+
+ # MathML:
+ mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
+ 'annotation-xml':NS_MATHML,
+
+ # SVG:
+ foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
formatting_elements = {
u: true
}
+el_is_special = (e) ->
+ return special_elements[e] is e.namespace
# decode_named_char_ref()
#
return null if decoded is txt
return g_dncr.cache[txt] = decoded
-parse_html = (txt) ->
+parse_html = (txt, parse_error_cb = null) ->
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
- open_tags = [] # stack of open elements
+ open_els = [] # stack of open elements
tree_state = null
tok_state = null
tok_cur_tag = null # partially parsed tag
flag_frameset_ok = null
flag_parsing = null
+ flag_foster_parenting = null
+ afe = [] # active formatting elements
parse_error = ->
- console.log "Parse error at character #{cur} of #{txt.length}"
+ if parse_error_cb?
+ parse_error_cb cur
+ else
+ console.log "Parse error at character #{cur} of #{txt.length}"
# the functions below impliment the Tree Contstruction algorithm
# But first... the helpers
template_tag_is_open = ->
- for t of open_tags
- if t[0] is TYPE_TAG and t[1] is 'template'
+ for t in open_els
+ if t.type is TYPE_TAG and t.name is 'template'
return true
return false
+ is_in_scope_x = (tag_name, scope) ->
+ for t in open_els
+ if t.name is tag_name
+ return true
+ if t.name of scope
+ return false
+ return false
+ is_in_scope_x_y = (tag_name, scope, scope2) ->
+ for t in open_els
+ if t.name is tag_name
+ return true
+ if t.name of scope
+ return false
+ if t.name of scope2
+ return false
+ return false
+ standard_scopers = { # FIXME these are supposed to be namespace specific
+ 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+ 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+ 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+ 'foreignObject': true, 'desc': true, 'title'
+ }
+ button_scopers = button: true
+ li_scopers = ol: true, ul: true
+ table_scopers = html: true, table: true, template: true
is_in_scope = (tag_name) ->
- for t of open_tags
- if t[0] is TYPE_TAG and t[1] is tag_name
+ return is_in_scope_x tag_name, standard_scopers
+ is_in_button_scope = (tag_name) ->
+ return is_in_scope_x_y tag_name, standard_scopers, button_scopers
+ is_in_table_scope = (tag_name) ->
+ return is_in_scope_x tag_name, table_scopers
+ is_in_select_scope = (tag_name) ->
+ for t in open_els
+ if t.name is tag_name
return true
- # FIXME bail if in scopers
+ if t.name isnt 'optgroup' and t.name isnt 'option'
+ return false
+ return false
+ # this checks for a particular element, not by name
+ el_is_in_scope = (el) ->
+ for t in open_els
+ if t is el
+ return true
+ if t.name of standard_scopers
+ return false
return false
+ # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+ # this implementation is structured (mostly) as described at the link above.
+ # capitalized comments are the "labels" described at the link above.
reconstruct_active_formatting_elements = ->
- # FIXME implement this
+ return if afe.length is 0
+ if afe[0].type is TYPE_MARKER or afe[0] in open_els
+ return
+ # Rewind
+ i = 0
+ loop
+ if i is afe.length - 1
+ break
+ i += 1
+ if afe[i].type is TYPE_MARKER or afe[i] in open_els
+ i -= 1 # Advance
+ break
+ # Create
+ loop
+ el = afe[i].shallow_clone()
+ tree_insert_element el
+ afe[i] = el
+ break if i is 0
+ i -= 1
+
+ # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+ # adoption agency algorithm
+ adoption_agency = (subject) ->
+ if open_els[0].name is subject
+ el = open_els[0]
+ open_els.shift()
+ # remove it from the list of active formatting elements (if found)
+ for t, i in afe
+ if t is el
+ afe.splice i, 1
+ break
+ return
+ outer = 0
+ loop
+ if outer >= 8
+ return
+ outer += 1
+ fe = null
+ for t, fe_index in afe
+ if t.type is TYPE_MARKER
+ break
+ if t.name is subject
+ fe = t
+ break
+ if fe is null
+ in_body_any_other_end_tag subject
+ return
+ in_open_els = false
+ for t in open_els
+ if t is fe
+ in_open_els = true
+ break
+ unless in_open_els
+ parse_error()
+ # "remove it from the list" must mean afe, since it's not in open_els
+ afe.splice fe_index, 1
+ return
+ unless el_is_in_scope fe
+ parse_error()
+ return
+ unless open_els[0] is fe
+ parse_error()
+ # continue
+ fb = null
+ fb_index
+ for t, i in open_els
+ if t is fe
+ break
+ if el_is_special t
+ fb = t
+ fb_index = i
+ if fb is null
+ loop
+ t = open_els.shift()
+ if t is fe
+ afe.splice fe_index, 1
+ return
+ ca = open_els[fe_index + 1] # common ancestor
+ node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
+ # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
+ bookmark = new_aaa_bookmark()
+ for t, i in afe
+ if t is fe
+ afe.splice i, 0, bookmark
+ node = last_node = fb
+ inner = 0
+ loop
+ inner += 1
+ node_next = null
+ for t, i in open_els
+ if t is node
+ node_next = open_els[i + 1]
+ break
+ node = node_next ? node_above
+ # TODO make sure node_above gets re-set if/when node is removed from open_els
+ if node is fe
+ break
+ node_in_afe = false
+ for t, i of afe
+ if t is node
+ if inner > 3
+ afe.splice i, 1
+ else
+ node_in_afe = true
+ break
+ unless node_in_afe
+ for t, i in open_els
+ if t is node
+ node_above = open_els[i + 1]
+ open_els.splice i, 1
+ break
+ continue
+ # 7. reate an element for the token for which the element node
+ # was created, in the HTML namespace, with common ancestor as
+ # the intended parent; replace the entry for node in the list
+ # of active formatting elements with an entry for the new
+ # element, replace the entry for node in the stack of open
+ # elements with an entry for the new element, and let node be
+ # the new element.
+ new_node = node.shallow_clone()
+ for t, i in afe
+ if t is node
+ afe[i] = new_node
+ break
+ for t, i in open_els
+ if t is node
+ open_els[i] = new_node
+ break
+ node = new_node
+ # 8. If last node is furthest block, then move the
+ # aforementioned bookmark to be immediately after the new node
+ # in the list of active formatting elements.
+ if last_node is fb
+ for t, i in afe
+ if t is bookmark
+ afe.splice i, 1
+ for t, i in afe
+ if t is node
+ # TODO test: position i gets you "after"?
+ afe.splice i, 0, new_aaa_bookmark()
+ # 9. Insert last node into node, first removing it from its
+ # previous parent node if any.
+ if last_node.parent?
+ for c, i of last_node.parent.children
+ if c is last_node
+ last_node.parent.children.splice i, 1
+ node.children.push last_node
+ last_node.parent = node
+ # 10. Let last node be node.
+ last_node = node
+ # 11. Return to the step labeled inner loop.
+ # 14. Insert whatever last node ended up being in the previous step
+ # at the appropriate place for inserting a node, but using common
+ # ancestor as the override target.
+ tree_insert_element last_node, ca
+ # 15. Create an element for the token for which formatting element
+ # was created, in the HTML namespace, with furthest block as the
+ # intended parent.
+ new_element = fe.shallow_clone()
+ # 16. Take all of the child nodes of furthest block and append them
+ # to the element created in the last step.
+ while fb.children.length
+ t = fb.children.shift()
+ t.parent = new_element
+ new_element.children.push t
+ # 17. Append that new element to furthest block.
+ new_element.parent = fb
+ fb.children.push new_element
+ # 18. Remove formatting element from the list of active formatting
+ # elements, and insert the new element into the list of active
+ # formatting elements at the position of the aforementioned
+ # bookmark.
+ for t, i in afe
+ if t is fe
+ afe.splice i, 1
+ break
+ for t, i in afe
+ if t is bookmark
+ afe[i] = node
+ break
+ # 19. Remove formatting element from the stack of open elements,
+ # and insert the new element into the stack of open elements
+ # immediately below the position of furthest block in that stack.
+ for t, i of open_els
+ if t is fe
+ open_els.splice i, 1
+ break
+ for t, i of open_els
+ if t is fb
+ open_els.splice i, 0, new_element
+ break
+ # 20. Jump back to the step labeled outer loop.
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# FIXME implement this
close_p_if_in_button_scope = ->
- if open_tags[0][1] is 'p' # FIXME
- open_tags.pop()
+ if open_els[0].name is 'p'
+ open_els.pop()
return
#p = find_button_scope 'p'
#if p?
# TODO generate_implied_end_tags except for p tags
- # TODO parse_error unless open_tags[0][1] is 'p'
+ # TODO parse_error unless open_els[0].name is 'p'
# TODO pop stack until 'p' popped
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+ tree_insert_text = (t) ->
+ dest = adjusted_insertion_location()
+ if dest[1] > 0
+ prev = dest[0].children[dest[1] - 1]
+ if prev.type is TYPE_TEXT
+ prev.text += t.text
+ return
+ dest[0].children.splice dest[1], 0, t
+ # 8.2.5.1
+ # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
+ # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
+ adjusted_insertion_location = (override_target = null) ->
+ # 1. If there was an override target specified, then let target be the
+ # override target.
+ if override_target?
+ target = override_target
+ else # Otherwise, let target be the current node.
+ target = open_els[0]
+ # 2. Determine the adjusted insertion location using the first matching
+ # steps from the following list:
+ #
+ # If foster parenting is enabled and target is a table, tbody, tfoot,
+ # thead, or tr element Foster parenting happens when content is
+ # misnested in tables.
+ if flag_foster_parenting and target.name in foster_parenting_targets
+ console.log "foster parenting isn't implemented yet" # TODO
+ # 1. Let last template be the last template element in the stack of
+ # open elements, if any.
+ # 2. Let last table be the last table element in the stack of open
+ # elements, if any.
- # http://www.w3.org/TR/html5/syntax.html#insert-a-character
- tree_insert_a_character = (t) ->
- # FIXME read spec for "adjusted insertion location, etc, this might be wrong
- if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
- open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+ # 3. If there is a last template and either there is no last table,
+ # or there is one, but last template is lower (more recently added)
+ # than last table in the stack of open elements, then: let adjusted
+ # insertion location be inside last template's template contents,
+ # after its last child (if any), and abort these substeps.
+
+ # 4. If there is no last table, then let adjusted insertion
+ # location be inside the first element in the stack of open
+ # elements (the html element), after its last child (if any), and
+ # abort these substeps. (fragment case)
+
+ # 5. If last table has a parent element, then let adjusted
+ # insertion location be inside last table's parent element,
+ # immediately before last table, and abort these substeps.
+
+ # 6. Let previous element be the element immediately above last
+ # table in the stack of open elements.
+
+ # 7. Let adjusted insertion location be inside previous element,
+ # after its last child (if any).
+
+ # Note: These steps are involved in part because it's possible for
+ # elements, the table element in this case in particular, to have
+ # been moved by a script around in the DOM, or indeed removed from
+ # the DOM entirely, after the element was inserted by the parser.
else
- open_tags[0][3].push t
+ # Otherwise Let adjusted insertion location be inside target, after
+ # its last child (if any).
+ target_i = target.children.length
- # FIXME read spec, do this right
- # note: this assumes it's an open tag
- tree_insert_tag = (t) ->
- t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+ # 3. If the adjusted insertion location is inside a template element,
+ # let it instead be inside the template element's template contents,
+ # after its last child (if any). TODO
+
+ # 4. Return the adjusted insertion location.
+ return [target, target_i]
+
+ # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
+ # aka create_an_element_for_token
+ token_to_element = (t, namespace, intended_parent) ->
+ t.type = TYPE_TAG # not TYPE_OPEN_TAG
# convert attributes into a hash
attrs = {}
- while t[2].length
- a = t[2].pop()
- attrs[a[0]] = a[1]
- t[2] = attrs
- open_tags[0][3].push t
- open_tags.unshift t
+ while t.attrs_a.length
+ a = t.attrs_a.pop()
+ attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
+ el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
+
+ # TODO 2. If the newly created element has an xmlns attribute in the
+ # XMLNS namespace whose value is not exactly the same as the element's
+ # namespace, that is a parse error. Similarly, if the newly created
+ # element has an xmlns:xlink attribute in the XMLNS namespace whose
+ # value is not the XLink Namespace, that is a parse error.
+
+ # fixfull: the spec says stuff about form pointers and ownerDocument
+
+ return el
+
+ # FIXME read implement "foster parenting" part
+ # FIXME read spec, do this right
+ # FIXME implement the override target thing
+ # note: this assumes it's an open tag
+ # TODO tree_insert_html_element = (t, ...
+ tree_insert_element = (el, override_target = null, namespace = null) ->
+ dest = adjusted_insertion_location override_target
+ if el.type is TYPE_OPEN_TAG # means it's a "token"
+ el = token_to_element el, namespace, dest[0]
+ # fixfull: Document nodes sometimes can't accept more chidren
+ dest[0].children.splice dest[1], 0, el
+ el.parent = dest[0]
+ open_els.unshift el
+ return el
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
tree_insert_a_comment = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
- open_tags[0][3].push t
+ open_els[0].children.push t
# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
+ for node, i in open_els
+ if node.name is name
+ # FIXME generate implied end tags except those with name==name
+ parse_error() unless i is 0
+ while i > 0
+ open_els.shift()
+ i -= 1
+ open_els.shift()
+ return
+ if special_elements[node.name]?
+ parse_error()
+ return
tree_in_body = (t) ->
- switch t[0]
+ switch t.type
when TYPE_TEXT
- switch t[1]
+ switch t.text
when "\u0000"
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
reconstruct_active_formatting_elements()
- tree_insert_a_character t
+ tree_insert_text t
else
reconstruct_active_formatting_elements()
- tree_insert_a_character t
+ tree_insert_text t
flag_frameset_ok = false
when TYPE_COMMENT
tree_insert_a_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_OPEN_TAG
- switch t[1]
+ switch t.name
when 'html'
parse_error()
return if template_tag_is_open()
- root_attrs = open_tags[open_tags.length - 1][3]
- for k, v of t[2]
+ root_attrs = open_els[open_els.length - 1].children
+ for k, v of t.attrs
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for </template> (end tag)
# TODO
when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
close_p_if_in_button_scope()
- tree_insert_tag t
+ tree_insert_element t
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
close_p_if_in_button_scope()
- if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+ if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
parse_error()
- open_tags.shift()
- tree_insert_tag t
+ open_els.shift()
+ tree_insert_element t
+ # TODO lots more to implement here
+ when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
+ reconstruct_active_formatting_elements()
+ el = tree_insert_element t
+ afe.push el
# TODO lots more to implement here
else # any other start tag
reconstruct_active_formatting_elements()
- tree_insert_tag t
+ tree_insert_element t
when TYPE_EOF
ok_tags = {
dd: true, dt: true, li: true, p: true, tbody: true, td: true,
tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
}
- for t in open_tags
- unless ok_tags[t[1]]?
+ for t in open_els
+ unless ok_tags[t.name]?
parse_error()
break
# TODO stack of template insertion modes thing
flag_parsing = false # stop parsing
when TYPE_END_TAG
- switch t[1]
+ switch t.name
when 'body'
unless is_in_scope 'body'
parse_error()
return
# TODO implement parse error and move to tree_after_body, reprocess
# TODO lots more close tags to implement here
+ when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
+ adoption_agency t.name
+ # TODO lots more close tags to implement here
else
- for node, i in open_tags
- if node[1] is t[1]
- # FIXME generate implied end tags except those with name==t[1]
- parse_error() unless i is 0
- while i > 0
- open_tags.shift()
- i -= 1
- open_tags.shift()
- return
- if special_elements[node[1]]?
- parse_error()
- return
+ in_body_any_other_end_tag t.name
+ return
# the functions below implement the tokenizer stats described here:
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- return [TYPE_TEXT, tokenize_character_reference()]
+ return new_text_node tokenize_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return [TYPE_TEXT, c]
+ return new_text_node c
when '' # EOF
- return [TYPE_EOF]
+ return new_eof_token()
else
- return [TYPE_TEXT, c]
+ return new_text_node c
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+ tok_cur_tag = new_open_tag c
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = new_open_tag c.toLowerCase()
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
- return [TYPE_TEXT, '<']
+ return new_text_node '<'
return null
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
when '' # EOF
parse_error()
tok_state = tok_state_data
- return [TYPE_TEXT, '</']
+ return new_text_node '</'
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = new_end_tag c.toLowerCase()
tok_state = tok_state_tag_name
else if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_END_TAG, c, [], []]
+ tok_cur_tag = new_end_tag c
tok_state = tok_state_tag_name
else
parse_error()
return tmp
when "\u0000"
parse_error()
- tok_cur_tag[1] += "\ufffd"
+ tok_cur_tag.name += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[1] += c.toLowerCase()
+ tok_cur_tag.name += c.toLowerCase()
else
- tok_cur_tag[1] += c
+ tok_cur_tag.name += c
return null
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
else
attr_name = c
if attr_name?
- tok_cur_tag[2].unshift [attr_name, '']
+ tok_cur_tag.attrs_a.unshift [attr_name, '']
tok_state = tok_state_attribute_name
return null
return tmp
when "\u0000"
parse_error()
- tok_cur_tag[2][0][0] = "\ufffd"
+ tok_cur_tag.attrs_a[0][0] = "\ufffd"
when '"', "'", '<'
parse_error()
- tok_cur_tag[2][0][0] = c
+ tok_cur_tag.attrs_a[0][0] = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][0] = c.toLowerCase()
+ tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
else
- tok_cur_tag[2][0][0] += c
+ tok_cur_tag.attrs_a[0][0] += c
return null
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state = tok_state_attribute_value_single_quoted
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
tok_state = tok_state_attribute_value_unquoted
when '>'
# Parse error
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
tok_state = tok_state_attribute_value_unquoted
return null
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error if ', <, = or ` (backtick)
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
- tree = [TYPE_TAG, 'html', {}, []]
- open_tags = [tree]
+ tree = new Node TYPE_TAG, name: 'html'
+ open_els = [tree]
tree_state = tree_in_body
flag_frameset_ok = true
flag_parsing = true
+ flag_foster_parenting = false
+ afe = [] # active formatting elements
# tokenizer initialization
tok_state = tok_state_data
t = tok_state()
if t?
tree_state t
- return tree[3]
+ return tree.children
# everything below is tests on the above
-test_equals = (description, fn, args..., expected_output) ->
- output = fn.apply this, args
+test_equals = (description, output, expected_output) ->
if output is expected_output
- console.log "passed: #{description}."
+ console.log "passed." # don't say name, so smart consoles can merge all of these
else
- console.log "FAILED: #{description}..."
+ console.log "FAILED: \"#{description}\""
console.log " Expected: #{expected_output}"
console.log " Actual: #{output}"
-html_to_json = (html) ->
- return JSON.stringify parse_html html
-test_equals "empty", html_to_json, "", '[]'
-test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
-test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
-test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
-test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
-test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
-test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
-test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
-test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
-test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
-test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
-test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'
+test_parser = (args) ->
+ parse_errors = []
+ errors_cb = (i) ->
+ parse_errors.push i
+ parsed = parse_html args.html, errors_cb
+ serialized = ''
+ sep = ''
+ for t in parsed
+ serialized += sep
+ sep = ','
+ serialized += t.serialize()
+ if serialized isnt args.expected or parse_errors.length isnt args.errors
+ console.log "FAILED: \"#{args.name}\""
+ else
+ console.log "passed \"#{args.name}\""
+ if serialized isnt args.expected
+ console.log " Input: #{args.html}"
+ console.log " Correct: #{args.expected}"
+ console.log " Output: #{serialized}"
+ if parse_errors.length isnt args.errors
+ console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+
+test_parser name: "empty", \
+ html: "",
+ expected: '',
+ errors: 0
+test_parser name: "just text", \
+ html: "abc",
+ expected: 'text:"abc"',
+ errors: 0
+test_parser name: "named entity", \
+ html: "a&1234",
+ expected: 'text:"a&1234"',
+ errors: 0
+test_parser name: "broken named character references", \
+ html: "1&2&&3&aabbcc;",
+ expected: 'text:"1&2&&3&aabbcc;"',
+ errors: 2
+test_parser name: "numbered entity overrides", \
+ html: "1€€ ƒ",
+ expected: 'text:"1€€ ƒ"',
+ errors: 0
+test_parser name: "open tag", \
+ html: "foo<span>bar",
+ expected: 'text:"foo",tag:"span",{},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes", \
+ html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
+ expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes of various quotings", \
+ html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
+ expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "attribute entity exceptions dq", \
+ html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions sq", \
+ html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions uq", \
+ html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "matching closing tags", \
+ html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
+ expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
+ errors: 0
+test_parser name: "missing closing tag inside", \
+ html: "foo<div>bar<span>baz</div>qux",
+ expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
+ errors: 1 # close tag mismatch
+test_parser name: "mis-matched closing tags", \
+ html: "<span>12<div>34</span>56</div>78",
+ expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
+ errors: 2 # misplaced </span>, no </span> at the end
+test_parser name: "mis-matched formatting elements", \
+ html: "12<b>34<i>56</b>78</i>90",
+ expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
+ errors: 1 # no idea how many their should be
+test_parser name: "crazy formatting elements test", \
+ html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
+ # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
+ # firefox does this:
+ expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
+ errors: 6 # no idea how many there should be
+# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
+test_parser name: "html5lib aaa 1", \
+ html: '<a><p></a></p>',
+ expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
+ errors: 2
+test_parser name: "html5lib aaa 2", \
+ html: '<a>1<p>2</a>3</p>',
+ expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
+ errors: 2
+test_parser name: "html5lib aaa 3", \
+ html: '<a>1<button>2</a>3</button>',
+ expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
+ errors: 2
+test_parser name: "html5lib aaa 4", \
+ html: '<a>1<b>2</a>3</b>',
+ expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
+ errors: 2
+test_parser name: "html5lib aaa 5", \
+ html: '<a>1<div>2<div>3</a>4</div>5</div>',
+ expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"]text:"5"]',
+ errors: 3