#
# Instead, the data structure produced by this parser is an array of nodes.
#
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an obect of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
+class Node
+ constructor: (type, args = {}) ->
+ @type = type # one of the TYPE_* constants above
+ @name = args.name ? '' # tag name
+ @text = args.text ? '' # contents for text/comment nodes
+ @attrs = args.attrs ? {}
+ @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
+ @children = args.children ? []
+ serialize: -> # for unit tests
+ ret = ''
+ switch @type
+ when TYPE_TAG
+ ret += 'tag:'
+ ret += JSON.stringify @name
+ ret += ','
+ ret += JSON.stringify @attrs
+ ret += ','
+ sep = '['
+ for c in @children
+ ret += sep
+ sep = ','
+ ret += c.serialize()
+ ret += ']'
+ when TYPE_TEXT
+ ret += 'text:'
+ ret += JSON.stringify @text
+ when TYPE_COMMENT
+ ret += 'comment:'
+ ret += JSON.stringify @text
+ when TYPE_DOCTYPE
+ ret += 'doctype'
+ # FIXME
+ else
+ ret += 'unknown:'
+ return ret
+
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+ return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+ return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+ return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+ return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+ return new Node TYPE_EOF
+
lc_alpha = "abcdefghijklmnopqrstuvwxqz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
scopers = { # FIXME these are supposed to be namespace specific
- 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
- 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
- 'foreignObject', 'desc', 'title'
+ 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+ 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+ 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+ 'foreignObject': true, 'desc': true, 'title'
}
# some SVG elements have dashes in them
# But first... the helpers
template_tag_is_open = ->
for t of open_tags
- if t[0] is TYPE_TAG and t[1] is 'template'
+ if t.type is TYPE_TAG and t.name is 'template'
return true
return false
is_in_scope = (tag_name) ->
for t of open_tags
- if t[0] is TYPE_TAG and t[1] is tag_name
+ if t.name is tag_name
return true
- # FIXME bail if in scopers
+ if t.name of scopers
+ return false
return false
reconstruct_active_formatting_elements = ->
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# FIXME implement this
close_p_if_in_button_scope = ->
- if open_tags[0][1] is 'p' # FIXME
+ if open_tags[0].name is 'p'
open_tags.pop()
return
#p = find_button_scope 'p'
#if p?
# TODO generate_implied_end_tags except for p tags
- # TODO parse_error unless open_tags[0][1] is 'p'
+ # TODO parse_error unless open_tags[0].name is 'p'
# TODO pop stack until 'p' popped
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
tree_insert_a_character = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
- if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
- open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+ dest = open_tags[0].children
+ if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
+ dest[dest.length - 1].text += t.text
else
- open_tags[0][3].push t
+ dest.push t
# FIXME read spec, do this right
# note: this assumes it's an open tag
tree_insert_tag = (t) ->
- t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+ t.type = TYPE_TAG # not TYPE_OPEN_TAG
# convert attributes into a hash
- attrs = {}
- while t[2].length
- a = t[2].pop()
- attrs[a[0]] = a[1]
- t[2] = attrs
- open_tags[0][3].push t
+ while t.attrs_a.length
+ a = t.attrs_a.pop()
+ t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
+ open_tags[0].children.push t
open_tags.unshift t
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
tree_insert_a_comment = (t) ->
# FIXME read spec for "adjusted insertion location, etc, this might be wrong
- open_tags[0][3].push t
+ open_tags[0].children.push t
# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
tree_in_body = (t) ->
- switch t[0]
+ switch t.type
when TYPE_TEXT
- switch t[1]
+ switch t.text
when "\u0000"
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
when TYPE_DOCTYPE
parse_error()
when TYPE_OPEN_TAG
- switch t[1]
+ switch t.name
when 'html'
parse_error()
return if template_tag_is_open()
- root_attrs = open_tags[open_tags.length - 1][3]
- for k, v of t[2]
+ root_attrs = open_tags[open_tags.length - 1].children
+ for k, v of t.attrs
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for </template> (end tag)
tree_insert_tag t
when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
close_p_if_in_button_scope()
- if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+ if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
parse_error()
open_tags.shift()
tree_insert_tag t
tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
}
for t in open_tags
- unless ok_tags[t[1]]?
+ unless ok_tags[t.name]?
parse_error()
break
# TODO stack of template insertion modes thing
flag_parsing = false # stop parsing
when TYPE_END_TAG
- switch t[1]
+ switch t.name
when 'body'
unless is_in_scope 'body'
parse_error()
# TODO lots more close tags to implement here
else
for node, i in open_tags
- if node[1] is t[1]
- # FIXME generate implied end tags except those with name==t[1]
+ if node.name is t.name
+ # FIXME generate implied end tags except those with name==t.name
parse_error() unless i is 0
while i > 0
open_tags.shift()
i -= 1
open_tags.shift()
return
- if special_elements[node[1]]?
+ if special_elements[node.name]?
parse_error()
return
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- return [TYPE_TEXT, tokenize_character_reference()]
+ return new_text_node tokenize_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return [TYPE_TEXT, c]
+ return new_text_node c
when '' # EOF
- return [TYPE_EOF]
+ return new_eof_token()
else
- return [TYPE_TEXT, c]
+ return new_text_node c
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+ tok_cur_tag = new_open_tag c
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = new_open_tag c.toLowerCase()
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
- return [TYPE_TEXT, '<']
+ return new_text_node '<'
return null
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
when '' # EOF
parse_error()
tok_state = tok_state_data
- return [TYPE_TEXT, '</']
+ return new_text_node '</'
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
+ tok_cur_tag = new_end_tag c.toLowerCase()
tok_state = tok_state_tag_name
else if lc_alpha.indexOf(c) > -1
- tok_cur_tag = [TYPE_END_TAG, c, [], []]
+ tok_cur_tag = new_end_tag c
tok_state = tok_state_tag_name
else
parse_error()
return tmp
when "\u0000"
parse_error()
- tok_cur_tag[1] += "\ufffd"
+ tok_cur_tag.name += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[1] += c.toLowerCase()
+ tok_cur_tag.name += c.toLowerCase()
else
- tok_cur_tag[1] += c
+ tok_cur_tag.name += c
return null
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
else
attr_name = c
if attr_name?
- tok_cur_tag[2].unshift [attr_name, '']
+ tok_cur_tag.attrs_a.unshift [attr_name, '']
tok_state = tok_state_attribute_name
return null
return tmp
when "\u0000"
parse_error()
- tok_cur_tag[2][0][0] = "\ufffd"
+ tok_cur_tag.attrs_a[0][0] = "\ufffd"
when '"', "'", '<'
parse_error()
- tok_cur_tag[2][0][0] = c
+ tok_cur_tag.attrs_a[0][0] = c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][0] = c.toLowerCase()
+ tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
else
- tok_cur_tag[2][0][0] += c
+ tok_cur_tag.attrs_a[0][0] += c
return null
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state = tok_state_attribute_value_single_quoted
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
tok_state = tok_state_attribute_value_unquoted
when '>'
# Parse error
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
tok_state = tok_state_attribute_value_unquoted
return null
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
when "\u0000"
# Parse error
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
- tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
+ tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
when "\u0000"
- tok_cur_tag[2][0][1] += "\ufffd"
+ tok_cur_tag.attrs_a[0][1] += "\ufffd"
when '' # EOF
parse_error()
tok_state = tok_state_data
else
# Parse Error if ', <, = or ` (backtick)
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag.attrs_a[0][1] += c
return null
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
- tree = [TYPE_TAG, 'html', {}, []]
+ tree = new Node TYPE_TAG, name: 'html'
open_tags = [tree]
tree_state = tree_in_body
flag_frameset_ok = true
t = tok_state()
if t?
tree_state t
- return tree[3]
+ return tree.children
# everything below is tests on the above
test_equals = (description, output, expected_output) ->
errors_cb = (i) ->
parse_errors.push i
parsed = parse_html args.html, errors_cb
- parsed = JSON.stringify parsed
- if parsed isnt args.expected or parse_errors.length isnt args.errors
+ serialized = ''
+ sep = ''
+ for t in parsed
+ serialized += sep
+ sep = ','
+ serialized += t.serialize()
+ if serialized isnt args.expected or parse_errors.length isnt args.errors
console.log "test FAILED: \"#{args.name}\""
else
console.log 'test passed'
- if parsed isnt args.expected
+ if serialized isnt args.expected
console.log " Input: #{args.html}"
console.log " Correct: #{args.expected}"
- console.log " Output: #{parsed}"
+ console.log " Output: #{serialized}"
if parse_errors.length isnt args.errors
console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
test_parser name: "empty", \
html: "",
- expected: '[]',
+ expected: '',
errors: 0
test_parser name: "just text", \
html: "abc",
- expected: '[[1,"abc"]]',
+ expected: 'text:"abc"',
errors: 0
test_parser name: "named entity", \
html: "a&1234",
- expected: '[[1,"a&1234"]]',
+ expected: 'text:"a&1234"',
errors: 0
test_parser name: "broken named character references", \
html: "1&2&&3&aabbcc;",
- expected: '[[1,"1&2&&3&aabbcc;"]]',
+ expected: 'text:"1&2&&3&aabbcc;"',
errors: 2
test_parser name: "numbered entity overrides", \
html: "1€€ ƒ",
- expected: '[[1,"1€€ ƒ"]]',
+ expected: 'text:"1€€ ƒ"',
errors: 0
test_parser name: "open tag", \
html: "foo<span>bar",
- expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"span",{},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "open tag with attributes", \
html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
- expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "open tag with attributes of various quotings", \
html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
- expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
errors: 1 # no close tag
test_parser name: "attribute entity exceptions dq", \
html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
- expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "attribute entity exceptions sq", \
html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
- expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "attribute entity exceptions uq", \
html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
- expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
errors: 2 # no close tag, &= in attr
test_parser name: "matching closing tags", \
html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
- expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
+ expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
errors: 0
test_parser name: "mis-matched closing tags", \
html: "foo<div>bar<span>baz</div>qux",
- expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
+ expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
errors: 1 # close tag mismatch