TYPE_COMMENT = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_EOF = 5
# Lowercase/uppercase ASCII letters, used to recognize tag and attribute
# name characters in the tokenizer states below.
# Fixed: both strings previously ended "...wxqz", omitting 'y'/'Y', which
# mis-tokenized any tag or attribute name containing that letter.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
tok_state = null
tok_cur_tag = null # partially parsed tag
+ parse_error = ->
+ console.log "Parse error at character #{cur} of #{txt.length}"
# the functions below implement the tokenizer states described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- tok_state = tok_state_character_reference_in_data
+ return [TYPE_TEXT, tokenize_character_reference()]
when '<'
tok_state = tok_state_tag_open
when "\u0000"
- # Parse error
+ parse_error()
return [TYPE_TEXT, c]
+ when '' # EOF
+ return [TYPE_EOF]
else
return [TYPE_TEXT, c]
return null
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
- # & just got consumed
- tok_state_character_reference_in_data = ->
- tok_state = tok_state_data
- if cur >= txt.length
- return [TYPE_TEXT, '&']
- switch c = txt.charAt(cur)
- when ';'
- return [TYPE_TEXT, '&']
- when '#'
- if cur + 1 >= txt.length
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
- charset = hex_chars
- start = cur + 2
- else
- charset = digits
- start = cur + 1
- prefix = '#'
- i = 0
- while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
- i += 1
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(start + i) is ';'
- i += 1
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- for i in [0...31]
- if alnum.indexOf(txt.charAt(cur + i)) is -1
- break
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + i) is ';'
- i += 1 # include ';' terminator in value
- decoded = decode_named_char_ref txt.substr(cur, i)
- if decoded?
- cur += i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- # no ';' terminator (only legacy char refs)
- if i < 2 or i > 6
- return [TYPE_TEXT, '&']
- # FIXME: if we're inside an attribute:
- # 1. don't parse refs that are followed by =
- # 2. don't parse refs that are followed by alnum
- max = i
- for i in [2..max] # no prefix matches, so ok to check shortest first
- c = legacy_char_refs[txt.substr(cur, i)]
- if c?
- cur += i # consume entity chars
- return [TYPE_TEXT, c]
- return null
+ # not needed: tok_state_character_reference_in_data = ->
+ # just call tokenize_character_reference() instead
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
when '/'
tok_state = tok_state_end_tag_open
when '?'
- # Parse error
+ parse_error()
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
tok_state = tok_state_tag_name
else
- # Parse error
+ parse_error()
tok_state = tok_state_data
cur -= 1 # we didn't parse/handle the char after <
return [TYPE_TEXT, '<']
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
+ parse_error()
tok_cur_tag[1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
tok_cur_tag[1] += c.toLowerCase()
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
+ parse_error()
attr_name = "\ufffd"
when '"', "'", '<', '='
- # Parse error
+ parse_error()
attr_name = c
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
attr_name = c.toLowerCase()
tok_cur_tag = null
return tmp
when "\u0000"
- # Parse error
- tok_cur_tag[2][0][0] += "\ufffd"
+ parse_error()
+ tok_cur_tag[2][0][0] = "\ufffd"
+ when '"', "'", '<'
+ parse_error()
+ tok_cur_tag[2][0][0] = c
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][0] += c.toLowerCase()
+ tok_cur_tag[2][0][0] = c.toLowerCase()
else
- # Parse error if ", ' or <
tok_cur_tag[2][0][0] += c
return null
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
tok_state = tok_state_attribute_value_unquoted
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_state = tok_state_character_reference_in_attribute_value
- tok_char_ref_addl_allowed = '"' # FIXME
+ tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
return null
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_state = tok_state_character_reference_in_attribute_value
- tok_char_ref_addl_allowed = "'" # FIXME
+ tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
tok_cur_tag[2][0][1] += c
return null
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
- tok_state = tok_state_character_reference_in_attribute_value
- tok_char_ref_addl_allowed = '>' # FIXME
+ tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
return tmp
when "\u0000"
tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
# Parse Error if ', <, = or ` (backtick)
tok_cur_tag[2][0][1] += c
tmp = tok_cur_tag
tok_cur_tag = null
return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
else
# Parse Error
tok_state = tok_state_before_attribute_name
cur -= 1 # we didn't handle that char
return null
+ # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # Don't set this as a state, just call it
+ # returns a string (NOT a text node)
+ tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+ if cur >= txt.length
+ return '&'
+ switch c = txt.charAt(cur)
+ when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+ # explicitly not a parse error
+ return '&'
+ when ';'
+ # there has to be "one or more" alnums between & and ; to be a parse error
+ return '&'
+ when '#'
+ if cur + 1 >= txt.length
+ return '&'
+ if txt.charAt(cur + 1).toLowerCase() is 'x'
+ prefix = '#x'
+ charset = hex_chars
+ start = cur + 2
+ else
+ charset = digits
+ start = cur + 1
+ prefix = '#'
+ i = 0
+ while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+ i += 1
+ if i is 0
+ return '&'
+ if txt.charAt(start + i) is ';'
+ i += 1
+ # FIXME This is supposed to generate parse errors for some chars
+ decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+ if decoded?
+ cur = start + i
+ return decoded
+ return '&'
+ else
+ for i in [0...31]
+ if alnum.indexOf(txt.charAt(cur + i)) is -1
+ break
+ if i is 0
+ # exit early, because parse_error() below needs at least one alnum
+ return '&'
+ if txt.charAt(cur + i) is ';'
+ i += 1 # include ';' terminator in value
+ decoded = decode_named_char_ref txt.substr(cur, i)
+ if decoded?
+ cur += i
+ return decoded
+ parse_error()
+ return '&'
+ else
+ # no ';' terminator (only legacy char refs)
+ max = i
+ for i in [2..max] # no prefix matches, so ok to check shortest first
+ c = legacy_char_refs[txt.substr(cur, i)]
+ if c?
+ if in_attr
+ if txt.charAt(cur + i) is '='
+ # "because some legacy user agents will
+ # misinterpret the markup in those cases"
+ parse_error()
+ return '&'
+ if alnum.indexOf(txt.charAt(cur + i)) > -1
+ # this makes attributes forgiving about url args
+ return '&'
+ # ok, and besides the weird exceptions for attributes...
+ # return the matching char
+ cur += i # consume entity chars
+ parse_error() # because no terminating ";"
+ return c
+ parse_error()
+ return '&'
+ return # never reached
+
+ # the functions below implement the Tree Construction algorithm here:
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
tree_append = (t) ->
- if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
- tree_append_point[tree_append_point.length - 1][1] += t[1]
- else
- tree_append_point.push t
- if t[0] is TYPE_OPEN_TAG
+ switch t[0]
+ when TYPE_TEXT
+ if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
+ tree_append_point[tree_append_point.length - 1][1] += t[1]
+ else
+ tree_append_point.push t
+ when TYPE_OPEN_TAG
t[0] = TYPE_TAG
+ # convert attributes into a hash
attrs = {}
while t[2].length
a = t[2].pop()
attrs[a[0]] = a[1]
t[2] = attrs
+ tree_append_point.push t
tree_append_point = t[3]
+ # TODO implement stack of open elements
+ # TODO implement formatting elements thing
+ when TYPE_EOF
+ return
+ # TODO implement close tags
+ # TODO implement self-closing tags
+ else
+ console.log "UNIMPLEMENTED tag type: #{t[0]}"
# tree constructor initialization
tree = [] # see comments on TYPE_TAG/etc for the structure of this data
tok_state = tok_state_data
# process input
- while cur < txt.length
+ loop
t = tok_state()
if t?
tree_state t
-
- return tree
+ if t[0] is TYPE_EOF
+ return tree
+ return # never reached
# everything below is tests on the above
test_equals = (description, fn, args..., expected_output) ->
if output is expected_output
console.log "passed: #{description}."
else
- console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
+ console.log "FAILED: #{description}..."
+ console.log " Expected: #{expected_output}"
+ console.log " Actual: #{output}"
html_to_json = (html) ->
return JSON.stringify parse_html html
test_equals "empty", html_to_json, "", '[]'
test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'