#
# Instead, the data structure produced by this parser is an array of nodes.
#
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
# the following types are emited by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_END_TAG = 5 # name
+TYPE_EOF = 6
+
+class Node
+	constructor: (type, args = {}) ->
+		@type = type # one of the TYPE_* constants above
+		@name = args.name ? '' # tag name
+		@text = args.text ? '' # contents for text/comment nodes
+		@attrs = args.attrs ? {}
+		# accept the old (misspelled) "attr_k" option too, for backwards compatibility
+		@attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
+		@children = args.children ? []
+	serialize: -> # for unit tests
+		ret = ''
+		switch @type
+			when TYPE_TAG
+				ret += 'tag:'
+				ret += JSON.stringify @name
+				ret += ','
+				ret += JSON.stringify @attrs
+				ret += ','
+				# always emit the opening bracket, even for childless tags
+				ret += '['
+				sep = ''
+				for c in @children
+					ret += sep
+					sep = ','
+					ret += c.serialize()
+				ret += ']'
+			when TYPE_TEXT
+				ret += 'text:'
+				ret += JSON.stringify @text
+			when TYPE_COMMENT
+				ret += 'comment:'
+				ret += JSON.stringify @text
+			when TYPE_DOCTYPE
+				ret += 'doctype'
+				# FIXME
+			else
+				ret += 'unknown:'
+		return ret
+
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+ return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+ return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+ return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+ return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+ return new Node TYPE_EOF
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+# NOTE(review): both alphabets above end in "wxqz" — 'y' is missing and 'q' is
+# duplicated, so tag/attribute names containing 'y' will not tokenize. Fix upstream.
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
+special_elements = {
+ # from HTML:
+ address: true, applet: true, area: true, article: true, aside: true,
+ base: true, basefont: true, bgsound: true, blockquote: true, body: true,
+ br: true, button: true, caption: true, center: true, col: true,
+ colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
+ dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
+ footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
+ h3: true, h4: true, h5: true, h6: true, head: true, header: true,
+ hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
+ isindex: true, li: true, link: true, listing: true, main: true,
+ marquee: true, meta: true, nav: true, noembed: true, noframes: true,
+ noscript: true, object: true, ol: true, p: true, param: true,
+ plaintext: true, pre: true, script: true, section: true, select: true,
+ source: true, style: true, summary: true, table: true, tbody: true,
+ td: true, template: true, textarea: true, tfoot: true, th: true,
+ thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
+ xmp: true,
+
+ # from MathML:
+ mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
+
+ # from SVG:
+ foreignObject: true, desc: true, title: true
+}
+
+formatting_elements = {
+ a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+ nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+ u: true
+}
+
# decode_named_char_ref()
#
return null if decoded is txt
return g_dncr.cache[txt] = decoded
-parse_html = (txt) ->
+parse_html = (txt, parse_error_cb = null) ->
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
- tree_append_point = null
+ open_tags = [] # stack of open elements
tree_state = null
tok_state = null
tok_cur_tag = null # partially parsed tag
+ flag_frameset_ok = null
+ flag_parsing = null
+
+ parse_error = ->
+ if parse_error_cb?
+ parse_error_cb cur
+ else
+ console.log "Parse error at character #{cur} of #{txt.length}"
+
+
+	# the functions below implement the Tree Construction algorithm
+ # http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+ # But first... the helpers
+ template_tag_is_open = ->
+ for t in open_tags
+ if t.type is TYPE_TAG and t.name is 'template'
+ return true
+ return false
+ is_in_scope_x = (tag_name, scope) ->
+ for t in open_tags
+ if t.name is tag_name
+ return true
+ if t.name of scope
+ return false
+ return false
+ is_in_scope_x_y = (tag_name, scope, scope2) ->
+ for t in open_tags
+ if t.name is tag_name
+ return true
+ if t.name of scope
+ return false
+ if t.name of scope2
+ return false
+ return false
+	standard_scopers = { # FIXME these are supposed to be namespace specific
+		'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+		'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+		'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+		'foreignObject': true, 'desc': true, 'title': true
+	}
+ button_scopers = button: true
+ li_scopers = ol: true, ul: true
+ table_scopers = html: true, table: true, template: true
+ is_in_scope = (tag_name) ->
+ return is_in_scope_x tag_name, standard_scopers
+ is_in_button_scope = (tag_name) ->
+ return is_in_scope_x_y tag_name, standard_scopers, button_scopers
+ is_in_table_scope = (tag_name) ->
+ return is_in_scope_x tag_name, table_scopers
+ is_in_select_scope = (tag_name) ->
+ for t in open_tags
+ if t.name is tag_name
+ return true
+ if t.name isnt 'optgroup' and t.name isnt 'option'
+ return false
+ return false
+
+ reconstruct_active_formatting_elements = ->
+ # FIXME implement this
+
+ # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+ # FIXME implement this
+	close_p_if_in_button_scope = ->
+		if open_tags[0].name is 'p'
+			# top of the stack is index 0 (elements are unshifted on), so use
+			# shift(); pop() would remove the root element from the bottom
+			open_tags.shift()
+		return
+	#p = find_button_scope 'p'
+	#if p?
+		# TODO generate_implied_end_tags except for p tags
+		# TODO parse_error unless open_tags[0].name is 'p'
+		# TODO pop stack until 'p' popped
+
+
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+ tree_insert_a_character = (t) ->
+ # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+ dest = open_tags[0].children
+ if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
+ dest[dest.length - 1].text += t.text
+ else
+ dest.push t
+
+ # FIXME read spec, do this right
+ # note: this assumes it's an open tag
+ tree_insert_tag = (t) ->
+ t.type = TYPE_TAG # not TYPE_OPEN_TAG
+ # convert attributes into a hash
+ while t.attrs_a.length
+ a = t.attrs_a.pop()
+			t.attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
+ open_tags[0].children.push t
+ open_tags.unshift t
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+ tree_insert_a_comment = (t) ->
+ # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+ open_tags[0].children.push t
+
+ # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ tree_in_body = (t) ->
+ switch t.type
+ when TYPE_TEXT
+ switch t.text
+ when "\u0000"
+ parse_error()
+ when "\t", "\u000a", "\u000c", "\u000d", ' '
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ else
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ flag_frameset_ok = false
+ when TYPE_COMMENT
+ tree_insert_a_comment t
+ when TYPE_DOCTYPE
+ parse_error()
+ when TYPE_OPEN_TAG
+ switch t.name
+ when 'html'
+ parse_error()
+ return if template_tag_is_open()
+						# merge attrs into the root element's attributes (not its children)
+						root_attrs = open_tags[open_tags.length - 1].attrs
+						for k, v of t.attrs
+							root_attrs[k] = v unless root_attrs[k]?
+ when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+ # FIXME also do this for </template> (end tag)
+ return tree_in_head t
+ when 'body'
+ parse_error()
+ # TODO
+ when 'frameset'
+ parse_error()
+ # TODO
+ when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+ close_p_if_in_button_scope()
+ tree_insert_tag t
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+ close_p_if_in_button_scope()
+ if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+ parse_error()
+ open_tags.shift()
+ tree_insert_tag t
+ # TODO lots more to implement here
+ else # any other start tag
+ reconstruct_active_formatting_elements()
+ tree_insert_tag t
+			when TYPE_EOF
+				ok_tags = {
+					dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+					tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+				}
+				# loop var renamed so it doesn't clobber the token parameter "t"
+				# (CoffeeScript loop variables are function-scoped)
+				for node in open_tags
+					unless ok_tags[node.name]?
+						parse_error()
+						break
+				# TODO stack of template insertion modes thing
+				flag_parsing = false # stop parsing
+ when TYPE_END_TAG
+ switch t.name
+ when 'body'
+ unless is_in_scope 'body'
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body
+ when 'html'
+ unless is_in_scope 'body' # weird, but it's what the spec says
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body, reprocess
+ # TODO lots more close tags to implement here
+ else
+ for node, i in open_tags
+ if node.name is t.name
+ # FIXME generate implied end tags except those with name==t.name
+ parse_error() unless i is 0
+ while i > 0
+ open_tags.shift()
+ i -= 1
+ open_tags.shift()
+ return
+ if special_elements[node.name]?
+ parse_error()
+ return
# the functions below implement the tokenizer stats described here:
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- tok_state = tok_state_character_reference_in_data
+ return new_text_node tokenize_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
- # Parse error
- return [TYPE_TEXT, c]
+ parse_error()
+ return new_text_node c
+ when '' # EOF
+ return new_eof_token()
else
- return [TYPE_TEXT, c]
+ return new_text_node c
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
- # & just got consumed
- tok_state_character_reference_in_data = ->
- tok_state = tok_state_data
- if cur >= txt.length
- return [TYPE_TEXT, '&']
- switch c = txt.charAt(cur)
- when ';'
- return [TYPE_TEXT, '&']
- when '#'
- if cur + 1 >= txt.length
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
- charset = hex_chars
- start = cur + 2
- else
- charset = digits
- start = cur + 1
- prefix = '#'
- i = 0
- while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
- i += 1
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(start + i) is ';'
- i += 1
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- for i in [0...31]
- if alnum.indexOf(txt.charAt(cur + i)) is -1
- break
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + i) is ';'
- i += 1 # include ';' terminator in value
- decoded = decode_named_char_ref txt.substr(cur, i)
- if decoded?
- cur += i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- # no ';' terminator (only legacy char refs)
- if i < 2 or i > 6
- return [TYPE_TEXT, '&']
- # FIXME: if we're inside an attribute:
- # 1. don't parse refs that are followed by =
- # 2. don't parse refs that are followed by alnum
- max = i
- for i in [2..max] # no prefix matches, so ok to check shortest first
- c = legacy_char_refs[txt.substr(cur, i)]
- if c?
- cur += i # consume entity chars
- return [TYPE_TEXT, c]
- return null
+	# not needed: tok_state_character_reference_in_data = ->
+	# just call tokenize_character_reference() instead
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
when '/'
tok_state = tok_state_end_tag_open
when '?'
- # Parse error
+ parse_error()
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+ tok_cur_tag = new_open_tag c
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = new_open_tag c.toLowerCase()
tok_state = tok_state_tag_name
else
- # Parse error
+ parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
- return [TYPE_TEXT, '<']
+ return new_text_node '<'
+ return null
+
+ # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+ tok_state_end_tag_open = ->
+ switch c = txt.charAt(cur++)
+ when '>'
+ parse_error()
+ tok_state = tok_state_data
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ return new_text_node '</'
+ else
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag = new_end_tag c.toLowerCase()
+ tok_state = tok_state_tag_name
+ else if lc_alpha.indexOf(c) > -1
+ tok_cur_tag = new_end_tag c
+ tok_state = tok_state_tag_name
+ else
+ parse_error()
+ tok_state = tok_state_bogus_comment
return null
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
- tok_cur_tag[1] += "\ufffd"
+ parse_error()
+ tok_cur_tag.name += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[1] += c.toLowerCase()
+ tok_cur_tag.name += c.toLowerCase()
else
- tok_cur_tag[1] += c
+ tok_cur_tag.name += c
return null
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
+ parse_error()
attr_name = "\ufffd"
when '"', "'", '<', '='
- # Parse error
+ parse_error()
attr_name = c
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
attr_name = c.toLowerCase()
else
attr_name = c
if attr_name?
- tok_cur_tag[2].unshift [attr_name, '']
+ tok_cur_tag.attrs_a.unshift [attr_name, '']
tok_state = tok_state_attribute_name
return null
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
- tok_cur_tag[2][0][0] += "\ufffd"
+			parse_error()
+			tok_cur_tag.attrs_a[0][0] += "\ufffd"
+		when '"', "'", '<'
+			parse_error()
+			tok_cur_tag.attrs_a[0][0] += c
+		when '' # EOF
+			parse_error()
+			tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][0] += c.toLowerCase()
+			tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
else
- # Parse error if ", ' or <
- tok_cur_tag[2][0][0] += c
+ tok_cur_tag.attrs_a[0][0] += c
return null
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state = tok_state_attribute_value_single_quoted
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
tok_state = tok_state_attribute_value_unquoted
when '>'
# Parse error
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
- if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][1] += c.toLowerCase()
- else
- # Parse error if ", ` or < (that's a backtick)
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
+ tok_state = tok_state_attribute_value_unquoted
return null
# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_state = tok_state_character_reference_in_attribute_value
- tok_char_ref_addl_allowed = '"' # FIXME
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
- tok_state = tok_state_attribute_value_unquoted
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
+ return null
+
+ # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+ tok_state_attribute_value_single_quoted = ->
+ switch c = txt.charAt(cur++)
+ when "'"
+ tok_state = tok_state_after_attribute_value_quoted
+ when '&'
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+ when "\u0000"
+ # Parse error
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ tok_cur_tag.attrs_a[0][1] += c
+ return null
+
+ # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+ tok_state_attribute_value_unquoted = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ tok_state = tok_state_before_attribute_name
+ when '&'
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ # Parse Error if ', <, = or ` (backtick)
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
# Parse Error
tok_state = tok_state_before_attribute_name
cur -= 1 # we didn't handle that char
return null
- # the functions below impliment the Tree Contstruction algorithm here:
- # http://www.w3.org/TR/html5/syntax.html#tree-construction
- # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
- tree_append = (t) ->
- if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
- tree_append_point[tree_append_point.length - 1][1] += t[1]
- else
- tree_append_point.push t
- if t[0] is TYPE_OPEN_TAG
- t[0] = TYPE_TAG
- attrs = {}
- while t[2].length
- a = t[2].pop()
- attrs[a[0]] = a[1]
- t[2] = attrs
- tree_append_point = t[3]
+ # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # Don't set this as a state, just call it
+ # returns a string (NOT a text node)
+ tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+ if cur >= txt.length
+ return '&'
+ switch c = txt.charAt(cur)
+ when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+ # explicitly not a parse error
+ return '&'
+ when ';'
+ # there has to be "one or more" alnums between & and ; to be a parse error
+ return '&'
+ when '#'
+ if cur + 1 >= txt.length
+ return '&'
+ if txt.charAt(cur + 1).toLowerCase() is 'x'
+ prefix = '#x'
+ charset = hex_chars
+ start = cur + 2
+ else
+ charset = digits
+ start = cur + 1
+ prefix = '#'
+ i = 0
+ while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+ i += 1
+ if i is 0
+ return '&'
+ if txt.charAt(start + i) is ';'
+ i += 1
+ # FIXME This is supposed to generate parse errors for some chars
+ decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+ if decoded?
+ cur = start + i
+ return decoded
+ return '&'
+ else
+ for i in [0...31]
+ if alnum.indexOf(txt.charAt(cur + i)) is -1
+ break
+ if i is 0
+ # exit early, because parse_error() below needs at least one alnum
+ return '&'
+ if txt.charAt(cur + i) is ';'
+ i += 1 # include ';' terminator in value
+ decoded = decode_named_char_ref txt.substr(cur, i)
+ if decoded?
+ cur += i
+ return decoded
+ parse_error()
+ return '&'
+ else
+ # no ';' terminator (only legacy char refs)
+ max = i
+ for i in [2..max] # no prefix matches, so ok to check shortest first
+ c = legacy_char_refs[txt.substr(cur, i)]
+ if c?
+ if in_attr
+ if txt.charAt(cur + i) is '='
+ # "because some legacy user agents will
+ # misinterpret the markup in those cases"
+ parse_error()
+ return '&'
+ if alnum.indexOf(txt.charAt(cur + i)) > -1
+ # this makes attributes forgiving about url args
+ return '&'
+ # ok, and besides the weird exceptions for attributes...
+ # return the matching char
+ cur += i # consume entity chars
+ parse_error() # because no terminating ";"
+ return c
+ parse_error()
+ return '&'
+ return # never reached
# tree constructor initialization
- tree = [] # see comments on TYPE_TAG/etc for the structure of this data
- tree_append_point = tree
- tree_state = tree_append
+ # see comments on TYPE_TAG/etc for the structure of this data
+ tree = new Node TYPE_TAG, name: 'html'
+ open_tags = [tree]
+ tree_state = tree_in_body
+ flag_frameset_ok = true
+ flag_parsing = true
# tokenizer initialization
tok_state = tok_state_data
# proccess input
- while cur < txt.length
+ while flag_parsing
t = tok_state()
if t?
tree_state t
-
- return tree
+ return tree.children
# everything below is tests on the above
-test_equals = (description, fn, args..., expected_output) ->
- output = fn.apply this, args
+test_equals = (description, output, expected_output) ->
if output is expected_output
- console.log "passed: #{description}."
+ console.log "passed." # don't say name, so smart consoles can merge all of these
else
- console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
-html_to_json = (html) ->
- return JSON.stringify parse_html html
-test_equals "empty", html_to_json, "", '[]'
-test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
-test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
-test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
-test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
-test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'
+ console.log "FAILED: \"#{description}\""
+ console.log " Expected: #{expected_output}"
+ console.log " Actual: #{output}"
+test_parser = (args) ->
+ parse_errors = []
+ errors_cb = (i) ->
+ parse_errors.push i
+ parsed = parse_html args.html, errors_cb
+ serialized = ''
+ sep = ''
+ for t in parsed
+ serialized += sep
+ sep = ','
+ serialized += t.serialize()
+ if serialized isnt args.expected or parse_errors.length isnt args.errors
+ console.log "FAILED: \"#{args.name}\""
+ else
+ console.log "passed \"#{args.name}\""
+ if serialized isnt args.expected
+ console.log " Input: #{args.html}"
+ console.log " Correct: #{args.expected}"
+ console.log " Output: #{serialized}"
+ if parse_errors.length isnt args.errors
+ console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+
+test_parser name: "empty", \
+ html: "",
+ expected: '',
+ errors: 0
+test_parser name: "just text", \
+ html: "abc",
+ expected: 'text:"abc"',
+ errors: 0
+test_parser name: "named entity", \
+ html: "a&1234",
+ expected: 'text:"a&1234"',
+ errors: 0
+test_parser name: "broken named character references", \
+ html: "1&2&&3&aabbcc;",
+ expected: 'text:"1&2&&3&aabbcc;"',
+ errors: 2
+test_parser name: "numbered entity overrides", \
+ html: "1€€ ƒ",
+ expected: 'text:"1€€ ƒ"',
+ errors: 0
+test_parser name: "open tag", \
+ html: "foo<span>bar",
+ expected: 'text:"foo",tag:"span",{},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes", \
+ html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
+ expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes of various quotings", \
+ html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
+ expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
+ errors: 1 # no close tag
+test_parser name: "attribute entity exceptions dq", \
+ html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions sq", \
+ html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions uq", \
+ html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "matching closing tags", \
+ html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
+ expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
+ errors: 0
+test_parser name: "missing closing tag inside", \
+ html: "foo<div>bar<span>baz</div>qux",
+ expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
+ errors: 1 # close tag mismatch
+test_parser name: "mis-matched closing tags", \
+ html: "<span>12<div>34</span>56</div>78",
+ expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
+ errors: 2 # misplaced </span>, no </span> at the end
+test_parser name: "mis-matched formatting elements", \
+ html: "12<b>34<i>56</b>78</i>90",
+ expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
+ errors: 2 # FIXME dunno how many there should be