attrs = {}
attrs[k] = v for k, v of @attrs
return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+ acknowledge_self_closing: ->
+ # fixfull
serialize: (shallow = false, show_ids = false) -> # for unit tests
ret = ''
switch @type
return new Node TYPE_TAG, name: name
# Builds a TYPE_TEXT Node whose text is the given string.
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
+new_character_token = new_text_node
# Builds a TYPE_COMMENT Node whose text is the given string.
new_comment_node = (txt) -> new Node TYPE_COMMENT, text: txt
new_eof_token = ->
# Builds a bare TYPE_AAA_BOOKMARK Node (no name, text or attributes passed).
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
-lc_alpha = "abcdefghijklmnopqrstuvwxqz"
-uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
- open_els = [] # stack of open elements
+ open_els = null # stack of open elements
+ afe = null # active formatting elements
+ template_insertion_modes = null
insertion_mode = null
+ original_insertion_mode = null
tok_state = null
tok_cur_tag = null # partially parsed tag
+ flag_scripting = null
flag_frameset_ok = null
flag_parsing = null
flag_foster_parenting = null
form_element_pointer = null
- afe = [] # active formatting elements
+ temporary_buffer = null
parse_error = ->
if parse_error_cb?
debug_log "AAA DONE"
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
- # FIXME test this (particularly emplied end tags)
close_p_element = ->
generate_implied_end_tags 'p' # arg is exception
if open_els[0].name isnt 'p'
close_p_element()
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
- tree_insert_text = (t) ->
+ # aka insert_a_character = (t) ->
+ insert_character = (t) ->
dest = adjusted_insertion_location()
# fixfull check for Document node
if dest[1] > 0
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# position should be [node, index_within_children]
- tree_insert_a_comment = (t, position = null) ->
+ tree_insert_comment = (t, position = null) ->
position ?= adjusted_insertion_location()
position[0].children.splice position[1], 0, t
+ # 8.2.5.2
+ # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
+ # Generic raw text element parsing: inserts an element for start-tag token
+ # t, puts the tokenizer in the RAWTEXT state and hands tree construction to
+ # the text insertion mode, remembering the mode that was active before.
+ parse_generic_raw_text = (t) ->
+   insert_html_element t
+   # save the current mode first so ins_mode_text can restore it later
+   original_insertion_mode = insertion_mode
+   tok_state = tok_state_rawtext
+   insertion_mode = ins_mode_text
+ # RCDATA counterpart of parse_generic_raw_text: inserts an element for
+ # start-tag token t, puts the tokenizer in the RCDATA state and hands tree
+ # construction to the text insertion mode, remembering the previous mode.
+ parse_generic_rcdata_text = (t) ->
+   insert_html_element t
+   # save the current mode first so ins_mode_text can restore it later
+   original_insertion_mode = insertion_mode
+   tok_state = tok_state_rcdata
+   insertion_mode = ins_mode_text
+
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Pops entries off the front of open_els for as long as the current node's
# name is flagged in end_tag_implied, stopping early at a node named `except`.
generate_implied_end_tags = (except = null) ->
  until open_els[0].name is except or not end_tag_implied[open_els[0].name]
    open_els.shift()
- # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
+ # "Anything else" branch of the "in head" mode, factored out so that the
+ # flow control of ins_mode_in_head can mirror the spec. Pops the current
+ # node, switches to the "after head" mode and reprocesses t there.
+ ins_mode_in_head_else = (t) ->
+   # spec says the node popped here will be a 'head' node
+   open_els.shift()
+   insertion_mode = ins_mode_after_head
+   return insertion_mode t
+ # "in head" insertion mode: processes a single token t. Each rule below is
+ # tried in order and the first one that matches handles the token; tokens
+ # that match no rule are passed to ins_mode_in_head_else.
+ ins_mode_in_head = (t) ->
+   # Whitespace character tokens are inserted as-is. U+000D is included so
+   # this rule agrees with the whitespace case of the in-body mode.
+   if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')
+     insert_character t
+     return
+   if t.type is TYPE_COMMENT
+     tree_insert_comment t
+     return
+   if t.type is TYPE_DOCTYPE
+     parse_error()
+     return
+   if t.type is TYPE_START_TAG and t.name is 'html'
+     ins_mode_in_body t
+     return
+   # Void head elements: insert, pop straight back off the stack, and
+   # acknowledge the self-closing flag.
+   if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
+     el = insert_html_element t
+     open_els.shift()
+     el.acknowledge_self_closing()
+     return
+   if t.type is TYPE_START_TAG and t.name is 'meta'
+     el = insert_html_element t
+     open_els.shift()
+     el.acknowledge_self_closing()
+     # fixfull encoding stuff
+     return
+   if t.type is TYPE_START_TAG and t.name is 'title'
+     # the RCDATA helper defined alongside parse_generic_raw_text
+     parse_generic_rcdata_text t
+     return
+   if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+     parse_generic_raw_text t
+     return
+   if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
+     insert_html_element t
+     insertion_mode = in_head_noscript # FIXME implement
+     return
+   if t.type is TYPE_START_TAG and t.name is 'script'
+     ail = adjusted_insertion_location()
+     el = token_to_element t, NS_HTML, ail
+     el.flag_parser_inserted true # FIXME implement
+     # fixfull fragment case
+     ail[0].children.splice ail[1], 0, el
+     open_els.unshift el
+     tok_state = tok_state_script_data
+     original_insertion_mode = insertion_mode # make sure orig... is defined
+     insertion_mode = ins_mode_text # FIXME implement
+     return
+   if t.type is TYPE_END_TAG and t.name is 'head'
+     open_els.shift() # will be a head element... spec says so
+     insertion_mode = ins_mode_after_head
+     return
+   if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
+     ins_mode_in_head_else t
+     return
+   if t.type is TYPE_START_TAG and t.name is 'template'
+     insert_html_element t
+     afe_push_marker()
+     flag_frameset_ok = false
+     insertion_mode = ins_mode_in_template
+     template_insertion_modes.unshift ins_mode_in_template # FIXME implement
+     return
+   if t.type is TYPE_END_TAG and t.name is 'template'
+     if template_tag_is_open()
+       # explicit parentheses: a bare name would not invoke the function
+       generate_implied_end_tags()
+       if open_els[0].name isnt 'template'
+         parse_error()
+       # pop up to and including the template element
+       loop
+         el = open_els.shift()
+         if el.name is 'template'
+           break
+       clear_afe_to_marker()
+       template_insertion_modes.shift()
+       reset_insertion_mode()
+     else
+       parse_error()
+     return
+   # A stray <head> start tag, or any end tag not handled above, is an error
+   # and the token is ignored.
+   if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
+     parse_error()
+     return
+   ins_mode_in_head_else t
+
+ # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
for node, i in open_els
if node.name is name # FIXME check namespace too
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
reconstruct_active_formatting_elements()
- tree_insert_text t
+ insert_character t
else
reconstruct_active_formatting_elements()
- tree_insert_text t
+ insert_character t
flag_frameset_ok = false
when TYPE_COMMENT
- tree_insert_a_comment t
+ tree_insert_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_START_TAG
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for </template> (end tag)
- return tree_in_head t
+ return ins_mode_in_head t
when 'body'
parse_error()
# TODO
el = afe.shift()
if el.type is TYPE_AFE_MARKER
return
+
+ # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
+ # "text" insertion mode: used while the tokenizer is inside an RCDATA,
+ # RAWTEXT or script data run. Dispatches on the type of token t.
+ ins_mode_text = (t) ->
+   switch t.type
+     when TYPE_TEXT
+       insert_character t
+       return
+     when TYPE_EOF
+       parse_error()
+       if open_els[0].name is 'script'
+         open_els[0].flag 'already started', true
+       open_els.shift()
+       # go back to the saved mode and let it reprocess the EOF token
+       insertion_mode = original_insertion_mode
+       insertion_mode t
+       return
+     when TYPE_END_TAG
+       # </script> and every other end tag are handled the same way here.
+       # fixfull the spec seems to assume that I'm going to run the script
+       # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
+       open_els.shift()
+       insertion_mode = original_insertion_mode
+       return
+   console.log 'warning: end of ins_mode_text reached'
+
+ # the functions below implement the tokenizer states described here:
+ # http://www.w3.org/TR/html5/syntax.html#tokenization
+
+ # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
ins_mode_in_table = (t) ->
switch t.type
when TYPE_TEXT
else
ins_mode_in_table_else t
when TYPE_COMMENT
- tree_insert_a_comment t
+ tree_insert_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_START_TAG
ins_mode_in_table_else t
else
parse_error()
- insert_html_element t
+ el = insert_html_element t
open_els.shift()
- # fixfull acknowledge sef-closing flag
+ el.acknowledge_self_closing()
when 'form'
parse_error()
if form_element_pointer?
ins_mode_in_table_else t
+ # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
switch t.type
when TYPE_TEXT
console.log "unimplemented ins_mode_in_table_text"
# FIXME CONTINUE
+ # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
ins_mode_in_table_body = (t) ->
if t.type is TYPE_START_TAG and t.name is 'tr'
clear_stack_to_table_body_context()
# Anything else
ins_mode_in_table t
+ # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
ins_mode_in_row = (t) ->
if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
clear_stack_to_table_row_context()
clear_afe_to_marker()
insertion_mode = ins_mode_in_row
- # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
+ # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
ins_mode_in_cell = (t) ->
if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
if is_in_table_scope t.name
# Anything Else
ins_mode_in_body t
-
- # the functions below implement the tokenizer stats described here:
- # http://www.w3.org/TR/html5/syntax.html#tokenization
-
# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- return new_text_node tokenize_character_reference()
+ return new_text_node parse_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# not needed: tok_state_character_reference_in_data = ->
- # just call tok_state_character_reference_in_data()
+ # just call parse_character_reference()
+
+ # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
+ # RCDATA state: consumes one character of txt and returns the token it
+ # produces, or null when the character only causes a state change.
+ tok_state_rcdata = ->
+   c = txt.charAt(cur++)
+   if c is '&'
+     return new_text_node parse_character_reference()
+   if c is '<'
+     tok_state = tok_state_rcdata_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   # charAt yields '' once cur is past the end of txt
+   return new_eof_token() if c is ''
+   return new_character_token c
+
+ # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
+ # not needed: tok_state_character_reference_in_rcdata = ->
+ # just call parse_character_reference()
+
+ # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
+ # RAWTEXT state: consumes one character of txt and returns the token it
+ # produces, or null when the character only causes a state change.
+ tok_state_rawtext = ->
+   c = txt.charAt(cur++)
+   if c is '<'
+     tok_state = tok_state_rawtext_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   # charAt yields '' once cur is past the end of txt
+   return new_eof_token() if c is ''
+   return new_character_token c
+
+ # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
+ # Script data state: consumes one character of txt and returns the token it
+ # produces, or null when the character only causes a state change.
+ tok_state_script_data = ->
+   c = txt.charAt(cur++)
+   if c is '<'
+     tok_state = tok_state_script_data_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   # charAt yields '' once cur is past the end of txt
+   return new_eof_token() if c is ''
+   return new_character_token c
+
+ # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
+ # PLAINTEXT state: consumes one character of txt and always returns a
+ # token for it; this state never switches to another tokenizer state.
+ tok_state_plaintext = ->
+   c = txt.charAt(cur++)
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   # charAt yields '' once cur is past the end of txt
+   return new_eof_token() if c is ''
+   return new_character_token c
+
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
tok_cur_tag.name += c
return null
+ # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
+ # RCDATA less-than sign state: entered after '<' was seen in RCDATA.
+ # Returns null when an end tag may be starting, otherwise a '<' text token.
+ tok_state_rcdata_less_than_sign = ->
+   c = txt.charAt(cur++)
+   unless c is '/'
+     # Anything else: the '<' was plain text
+     cur -= 1 # reconsume the input character
+     tok_state = tok_state_rcdata
+     return new_character_token '<'
+   tok_state = tok_state_rcdata_end_tag_open
+   temporary_buffer = ''
+   return null
+
+ # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
+ # RCDATA end tag open state: entered after "</" was seen in RCDATA.
+ # A letter starts a candidate end tag (returns null); anything else,
+ # including end of input, emits "</" as text and goes back to RCDATA.
+ tok_state_rcdata_end_tag_open = ->
+   c = txt.charAt(cur++)
+   # At end of input charAt returns '', and indexOf('') is 0 for any string,
+   # so the empty string has to be excluded before the alphabet lookups.
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag = new_end_tag c.toLowerCase()
+     temporary_buffer += c
+     tok_state = tok_state_rcdata_end_tag_name
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag = new_end_tag c
+     temporary_buffer += c
+     tok_state = tok_state_rcdata_end_tag_name
+     return null
+   # Anything else
+   tok_state = tok_state_rcdata
+   cur -= 1 # reconsume the input character
+   return new_character_token "</" # fixfull separate these
+
+ # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
+ # Reports whether token t is an end tag whose name equals the name of the
+ # current node (open_els[0]).
+ is_appropriate_end_tag = (t) ->
+   # The spec compares against "the tag name of the last start tag to have
+   # been emitted from this tokenizer". This is only called from the various
+   # "raw" states, which presumably all push the start token onto open_els.
+   # TODO: verify this after the script data states are implemented
+   debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+   return false unless t.type is TYPE_END_TAG
+   return t.name is open_els[0].name
+
+ # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
+ # RCDATA end tag name state: accumulates the name of a candidate end tag.
+ # Returns the finished end tag token on '>', null while still collecting or
+ # switching state, and otherwise the buffered "</..." text as a text token.
+ tok_state_rcdata_end_tag_name = ->
+   c = txt.charAt(cur++)
+   if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_before_attribute_name
+       return null
+     # else fall through to "Anything else"
+   if c is '/'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
+       return null
+     # else fall through to "Anything else"
+   if c is '>'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_data
+       return tok_cur_tag
+     # else fall through to "Anything else"
+   # At end of input charAt returns '', and indexOf('') is 0 for any string;
+   # without the explicit check this state would keep appending '' forever
+   # and never reach the EOF handling of the RCDATA state.
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c.toLowerCase()
+     temporary_buffer += c
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c
+     temporary_buffer += c
+     return null
+   # Anything else
+   tok_state = tok_state_rcdata
+   cur -= 1 # reconsume the input character
+   return new_character_token '</' + temporary_buffer # fixfull separate these
+
+ # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
+ # RAWTEXT less-than sign state: entered after '<' was seen in RAWTEXT.
+ # Returns null when an end tag may be starting, otherwise a '<' text token.
+ tok_state_rawtext_less_than_sign = ->
+   c = txt.charAt(cur++)
+   unless c is '/'
+     # Anything else: the '<' was plain text
+     cur -= 1 # reconsume the input character
+     tok_state = tok_state_rawtext
+     return new_character_token '<'
+   tok_state = tok_state_rawtext_end_tag_open
+   temporary_buffer = ''
+   return null
+
+ # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
+ # RAWTEXT end tag open state: entered after "</" was seen in RAWTEXT.
+ # A letter starts a candidate end tag (returns null); anything else,
+ # including end of input, emits "</" as text and goes back to RAWTEXT.
+ tok_state_rawtext_end_tag_open = ->
+   c = txt.charAt(cur++)
+   # At end of input charAt returns '', and indexOf('') is 0 for any string,
+   # so the empty string has to be excluded before the alphabet lookups.
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag = new_end_tag c.toLowerCase()
+     temporary_buffer += c
+     tok_state = tok_state_rawtext_end_tag_name
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag = new_end_tag c
+     temporary_buffer += c
+     tok_state = tok_state_rawtext_end_tag_name
+     return null
+   # Anything else
+   tok_state = tok_state_rawtext
+   cur -= 1 # reconsume the input character
+   return new_character_token "</" # fixfull separate these
+
+ # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
+ # RAWTEXT end tag name state: accumulates the name of a candidate end tag.
+ # Returns the finished end tag token on '>', null while still collecting or
+ # switching state, and otherwise the buffered "</..." text as a text token.
+ tok_state_rawtext_end_tag_name = ->
+   c = txt.charAt(cur++)
+   if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_before_attribute_name
+       return null
+     # else fall through to "Anything else"
+   if c is '/'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_self_closing_start_tag
+       return null
+     # else fall through to "Anything else"
+   if c is '>'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_data
+       return tok_cur_tag
+     # else fall through to "Anything else"
+   # At end of input charAt returns '', and indexOf('') is 0 for any string;
+   # without the explicit check this state would keep appending '' forever
+   # and never reach the EOF handling of the RAWTEXT state.
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c.toLowerCase()
+     temporary_buffer += c
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c
+     temporary_buffer += c
+     return null
+   # Anything else
+   tok_state = tok_state_rawtext
+   cur -= 1 # reconsume the input character
+   return new_character_token '</' + temporary_buffer # fixfull separate these
+
+ # TODO _all_ of the missing states here (17-33) are for parsing script tags
+
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_state_before_attribute_name = ->
attr_name = null
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
- tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+ parse_character_reference = (allowed_char = null, in_attr = false) ->
if cur >= txt.length
return '&'
switch c = txt.charAt(cur)
# see comments on TYPE_TAG/etc for the structure of this data
tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
open_els = [tree]
+ afe = [] # active formatting elements
+ template_insertion_modes = []
insertion_mode = ins_mode_in_body
+ original_insertion_mode = insertion_mode # TODO check spec
+ flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
flag_frameset_ok = true
flag_parsing = true
flag_foster_parenting = false
form_element_pointer = null
- afe = [] # active formatting elements
+ temporary_buffer = null
# tokenizer initialization
tok_state = tok_state_data
expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
test_parser name: "open tag with attributes of various quotings", \
html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
- expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
+ expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
test_parser name: "attribute entity exceptions dq", \
html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
test_parser name: "junk after attribute close-quote", \
html: '<p><b c="d", e="f">foo<p>x',
expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
+test_parser name: "html5lib aaa02 1", \
+ html: '<b>1<i>2<p>3</b>4',
+ expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
+test_parser name: "html5lib aaa02 2", \
+ html: '<a><div><style></style><address><a>',
+ expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'