attrs = {}
attrs[k] = v for k, v of @attrs
return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+ # Placeholder for acknowledging a token's self-closing flag. The body is
+ # empty for now, so calling it has no effect.
+ acknowledge_self_closing: ->
+ # fixfull
serialize: (shallow = false, show_ids = false) -> # for unit tests
ret = ''
switch @type
ret += "##{@id},"
if shallow
break
- ret += JSON.stringify @attrs
- ret += ',['
+ attr_keys = []
+ for k of @attrs
+ attr_keys.push k
+ attr_keys.sort()
+ ret += '{'
+ sep = ''
+ for k in attr_keys
+ ret += sep
+ sep = ','
+ ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
+ ret += '},['
sep = ''
for c in @children
ret += sep
return new Node TYPE_TAG, name: name
# Build a Node of type TYPE_TEXT whose text field is txt.
new_text_node = (txt) ->
return new Node TYPE_TEXT, text: txt
+new_character_token = new_text_node
# Build a Node of type TYPE_COMMENT whose text field is txt.
new_comment_node = (txt) ->
return new Node TYPE_COMMENT, text: txt
new_eof_token = ->
# Build a Node of type TYPE_AAA_BOOKMARK; no other fields are passed.
new_aaa_bookmark = ->
return new Node TYPE_AAA_BOOKMARK
-lc_alpha = "abcdefghijklmnopqrstuvwxqz"
-uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
- open_els = [] # stack of open elements
+ open_els = null # stack of open elements
+ afe = null # active formatting elements
+ template_insertion_modes = null
insertion_mode = null
+ original_insertion_mode = null
tok_state = null
tok_cur_tag = null # partially parsed tag
+ flag_scripting = null
flag_frameset_ok = null
flag_parsing = null
flag_foster_parenting = null
form_element_pointer = null
- afe = [] # active formatting elements
+ temporary_buffer = null
parse_error = ->
if parse_error_cb?
else
console.log "Parse error at character #{cur} of #{txt.length}"
+ # Push new_el onto the front of afe (the list of active formatting
+ # elements; index 0 is the most recently added entry), applying the spec's
+ # "Noah's Ark clause": if three entries after the last marker already have
+ # the same name, namespace and attributes as new_el, the earliest of them
+ # is removed first.
+ # Returns the value of afe.unshift (the new length of afe).
+ afe_push = (new_el) ->
+   matches = 0
+   for el, i in afe
+     # only entries after the last marker take part in the clause
+     break if el.type is TYPE_AFE_MARKER
+     continue unless el.name is new_el.name and el.namespace is new_el.namespace
+     # Compare attributes in both directions so that neither element may
+     # carry an attribute the other lacks. A flag is needed here because a
+     # bare "continue" inside these inner loops would only skip to the next
+     # attribute, not to the next afe entry.
+     afe_attrs_equal = true
+     for k, v of el.attrs
+       unless new_el.attrs[k] is v
+         afe_attrs_equal = false
+         break
+     if afe_attrs_equal
+       for k, v of new_el.attrs
+         unless el.attrs[k] is v
+           afe_attrs_equal = false
+           break
+     continue unless afe_attrs_equal
+     matches += 1
+     if matches is 3
+       # walking from newest to oldest, the third match is the earliest one
+       afe.splice i, 1
+       break
+   afe.unshift new_el
+ # Put a fresh marker (as built by new_afe_marker) at the front of afe, the
+ # list of active formatting elements.
+ afe_push_marker = ->
+ afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
# http://www.w3.org/TR/html5/syntax.html#tree-construction
tree_insert_element el
afe[i] = el
break if i is 0
- i -= 1
+ i -= 1 # Advance
# http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
# adoption agency algorithm
# http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
# http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
adoption_agency = (subject) ->
+ debug_log "adoption_agency()"
+ debug_log "tree: #{serialize_els tree.children, false, true}"
+ debug_log "open_els: #{serialize_els open_els, true, true}"
+ debug_log "afe: #{serialize_els afe, true, true}"
if open_els[0].name is subject
el = open_els[0]
open_els.shift()
if t is el
afe.splice i, 1
break
+ debug_log "aaa: starting off with subject on top of stack, exiting"
return
outer = 0
loop
# If there is no such element, then abort these steps and instead
# act as described in the "any other end tag" entry above.
if fe is null
+ debug_log "aaa: fe not found in afe"
in_body_any_other_end_tag subject
return
# 6. If formatting element is not in the stack of open elements,
in_open_els = true
break
unless in_open_els
+ debug_log "aaa: fe not found in open_els"
parse_error()
# "remove it from the list" must mean afe, since it's not in open_els
afe.splice fe_of_afe, 1
# the element is not in scope, then this is a parse error; abort
# these steps.
unless el_is_in_scope fe
+ debug_log "aaa: fe not in scope"
parse_error()
return
# 8. If formatting element is not the current node, this is a parse
# formatting element from the list of active formatting elements,
# and finally abort these steps.
if fb is null
+ debug_log "aaa: no fb"
loop
t = open_els.shift()
if t is fe
break
node = node_next ? node_above
debug_log "inner loop #{inner}"
- debug_log "open_els: #{serialize_els open_els, true, true}"
debug_log "tree: #{serialize_els tree.children, false, true}"
+ debug_log "open_els: #{serialize_els open_els, true, true}"
debug_log "afe: #{serialize_els afe, true, true}"
debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
debug_log "AAA DONE"
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
- # FIXME test this (particularly emplied end tags)
close_p_element = ->
generate_implied_end_tags 'p' # arg is exception
if open_els[0].name isnt 'p'
parse_error()
while open_els.length > 1 # just in case
- t = open_els.shift()
- if t.name is 'p'
+ el = open_els.shift()
+ if el.name is 'p'
return
close_p_if_in_button_scope = ->
if is_in_button_scope 'p'
- close_a_p_element()
+ close_p_element()
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
- tree_insert_text = (t) ->
+ # aka insert_a_character = (t) ->
+ insert_character = (t) ->
dest = adjusted_insertion_location()
+ # fixfull check for Document node
if dest[1] > 0
prev = dest[0].children[dest[1] - 1]
if prev.type is TYPE_TEXT
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# position should be [node, index_within_children]
- tree_insert_a_comment = (t, position = null) ->
+ tree_insert_comment = (t, position = null) ->
position ?= adjusted_insertion_location()
position[0].children.splice position[1], 0, t
+ # 8.2.5.2
+ # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
+ # Generic raw text element parsing: insert the element for token t,
+ # remember the insertion mode that was active, then switch the tokenizer
+ # to the RAWTEXT state and tree construction to the "text" mode.
+ parse_generic_raw_text = (t) ->
+   insert_html_element t
+   original_insertion_mode = insertion_mode
+   tok_state = tok_state_rawtext
+   insertion_mode = ins_mode_text
+ # Generic RCDATA element parsing: insert the element for token t,
+ # remember the insertion mode that was active, then switch the tokenizer
+ # to the RCDATA state and tree construction to the "text" mode.
+ parse_generic_rcdata_text = (t) ->
+   insert_html_element t
+   original_insertion_mode = insertion_mode
+   tok_state = tok_state_rcdata
+   insertion_mode = ins_mode_text
+
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
generate_implied_end_tags = (except = null) ->
- while end_tag_implied[open_els[0]] and open_els[0].name isnt except
+ while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
open_els.shift()
- # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
+ # "Anything else" branch of the "in head" insertion mode: pop the current
+ # node off open_els, switch to ins_mode_after_head and hand token t to the
+ # new insertion mode.
+ ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
+ open_els.shift() # spec says this will be a 'head' node
+ insertion_mode = ins_mode_after_head
+ insertion_mode t
+ # Tree construction handler for the "in head" insertion mode. Each branch
+ # below mirrors one entry of the spec's token table, in spec order; tokens
+ # not matched by any branch are handed to ins_mode_in_head_else.
+ #   t: the token to process (read via t.type, t.name and t.text)
+ ins_mode_in_head = (t) ->
+   # whitespace character tokens (same set the "in body" mode tests for)
+   if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')
+     insert_character t
+     return
+   if t.type is TYPE_COMMENT
+     tree_insert_comment t
+     return
+   if t.type is TYPE_DOCTYPE
+     parse_error()
+     return
+   if t.type is TYPE_START_TAG and t.name is 'html'
+     ins_mode_in_body t
+     return
+   if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
+     el = insert_html_element t
+     open_els.shift() # these elements are popped straight back off
+     el.acknowledge_self_closing()
+     return
+   if t.type is TYPE_START_TAG and t.name is 'meta'
+     el = insert_html_element t
+     open_els.shift()
+     el.acknowledge_self_closing()
+     # fixfull encoding stuff
+     return
+   if t.type is TYPE_START_TAG and t.name is 'title'
+     parse_generic_rcdata_text t
+     return
+   if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+     parse_generic_raw_text t
+     return
+   if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
+     insert_html_element t
+     insertion_mode = in_head_noscript # FIXME implement
+     return
+   if t.type is TYPE_START_TAG and t.name is 'script'
+     ail = adjusted_insertion_location()
+     el = token_to_element t, NS_HTML, ail
+     el.flag_parser_inserted true # FIXME implement
+     # fixfull fragment case
+     ail[0].children.splice ail[1], 0, el
+     open_els.unshift el
+     tok_state = tok_state_script_data
+     original_insertion_mode = insertion_mode # make sure orig... is defined
+     insertion_mode = ins_mode_text # FIXME implement
+     return
+   if t.type is TYPE_END_TAG and t.name is 'head'
+     open_els.shift() # will be a head element... spec says so
+     insertion_mode = ins_mode_after_head
+     return
+   if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
+     ins_mode_in_head_else t
+     return
+   if t.type is TYPE_START_TAG and t.name is 'template'
+     insert_html_element t
+     afe_push_marker()
+     flag_frameset_ok = false
+     insertion_mode = ins_mode_in_template
+     template_insertion_modes.unshift ins_mode_in_template # FIXME implement
+     return
+   if t.type is TYPE_END_TAG and t.name is 'template'
+     if template_tag_is_open()
+       # the parentheses matter: a bare name would only reference the
+       # function without calling it
+       generate_implied_end_tags()
+       if open_els[0].name isnt 'template'
+         parse_error()
+       # pop elements up to and including the template element
+       loop
+         el = open_els.shift()
+         if el.name is 'template'
+           break
+       clear_afe_to_marker()
+       template_insertion_modes.shift()
+       reset_insertion_mode()
+     else
+       parse_error()
+     return
+   # a stray <head> start tag, or any end tag not handled above
+   if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
+     parse_error()
+     return
+   ins_mode_in_head_else t
+
+ # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
for node, i in open_els
if node.name is name # FIXME check namespace too
parse_error()
when "\t", "\u000a", "\u000c", "\u000d", ' '
reconstruct_active_formatting_elements()
- tree_insert_text t
+ insert_character t
else
reconstruct_active_formatting_elements()
- tree_insert_text t
+ insert_character t
flag_frameset_ok = false
when TYPE_COMMENT
- tree_insert_a_comment t
+ tree_insert_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_START_TAG
root_attrs[k] = v unless root_attrs[k]?
when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
# FIXME also do this for </template> (end tag)
- return tree_in_head t
+ return ins_mode_in_head t
when 'body'
parse_error()
# TODO
if el is found
open_els.splice i, 1
reconstruct_active_formatting_elements()
- el = tree_insert_element t
- afe.unshift el
+ el = insert_html_element t
+ afe_push el
when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
reconstruct_active_formatting_elements()
- el = tree_insert_element t
- afe.unshift el
+ el = insert_html_element t
+ afe_push el
when 'table'
# fixfull quirksmode thing
close_p_if_in_button_scope()
# TODO lots more to implement here
else # any other start tag
reconstruct_active_formatting_elements()
- tree_insert_element t
+ insert_html_element t
when TYPE_EOF
ok_tags = {
dd: true, dt: true, li: true, p: true, tbody: true, td: true,
unless is_in_button_scope 'p'
parse_error()
insert_html_element new_open_tag 'p'
- close_p_element()
+ close_p_element()
# TODO lots more close tags to implement here
when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
adoption_agency t.name
el = afe.shift()
if el.type is TYPE_AFE_MARKER
return
+
+ # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
+ # Tree construction handler for the "text" insertion mode, dispatching on
+ # the type of token t. Any token type not listed falls out of the switch
+ # and is reported with a console warning.
+ ins_mode_text = (t) ->
+   switch t.type
+     when TYPE_TEXT
+       insert_character t
+       return
+     when TYPE_EOF
+       parse_error()
+       if open_els[0].name is 'script'
+         open_els[0].flag 'already started', true
+       open_els.shift()
+       insertion_mode = original_insertion_mode
+       insertion_mode t # reprocess the EOF token in the restored mode
+       return
+     when TYPE_END_TAG
+       # </script> and every other end tag are handled the same way for now
+       # fixfull the spec seems to assume that I'm going to run the script
+       # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
+       open_els.shift()
+       insertion_mode = original_insertion_mode
+       return
+   console.log 'warning: end of ins_mode_text reached'
+
+ # the functions below implement the tokenizer states described here:
+ # http://www.w3.org/TR/html5/syntax.html#tokenization
+
+ # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
ins_mode_in_table = (t) ->
switch t.type
when TYPE_TEXT
else
ins_mode_in_table_else t
when TYPE_COMMENT
- tree_insert_a_comment t
+ tree_insert_comment t
when TYPE_DOCTYPE
parse_error()
when TYPE_START_TAG
switch t.name
when 'caption'
clear_stack_to_table_context()
- afe.unshift new_afe_marker()
+ afe_push_marker()
insert_html_element t
insertion_mode = ins_mode_in_caption
when 'colgroup'
ins_mode_in_table_else t
else
parse_error()
- insert_html_element t
+ el = insert_html_element t
open_els.shift()
- # fixfull acknowledge sef-closing flag
+ el.acknowledge_self_closing()
when 'form'
parse_error()
if form_element_pointer?
ins_mode_in_table_else t
+ # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
switch t.type
when TYPE_TEXT
console.log "unimplemented ins_mode_in_table_text"
# FIXME CONTINUE
+ # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
ins_mode_in_table_body = (t) ->
if t.type is TYPE_START_TAG and t.name is 'tr'
clear_stack_to_table_body_context()
# Anything else
ins_mode_in_table t
+ # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
ins_mode_in_row = (t) ->
if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
clear_stack_to_table_row_context()
insert_html_element t
insertion_mode = ins_mode_in_cell
- afe.unshift new_afe_marker()
+ afe_push_marker()
return
if t.type is TYPE_END_TAG and t.name is 'tr'
if is_in_table_scope 'tr'
clear_afe_to_marker()
insertion_mode = ins_mode_in_row
- # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
+ # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
ins_mode_in_cell = (t) ->
if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
if is_in_table_scope t.name
# Anything Else
ins_mode_in_body t
-
- # the functions below implement the tokenizer stats described here:
- # http://www.w3.org/TR/html5/syntax.html#tokenization
-
# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
- return new_text_node tokenize_character_reference()
+ return new_text_node parse_character_reference()
when '<'
tok_state = tok_state_tag_open
when "\u0000"
# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# not needed: tok_state_character_reference_in_data = ->
- # just call tok_state_character_reference_in_data()
+ # just call parse_character_reference()
+
+ # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
+ # RCDATA state: consume one character of txt and return the token it
+ # produces, or null when only the tokenizer state changed.
+ tok_state_rcdata = ->
+   c = txt.charAt(cur++)
+   if c is '&'
+     return new_text_node parse_character_reference()
+   if c is '<'
+     tok_state = tok_state_rcdata_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   if c is '' # EOF
+     return new_eof_token()
+   return new_character_token c
+
+ # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
+ # not needed: tok_state_character_reference_in_rcdata = ->
+ # just call parse_character_reference()
+
+ # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
+ # RAWTEXT state: consume one character of txt and return the token it
+ # produces, or null when only the tokenizer state changed.
+ tok_state_rawtext = ->
+   c = txt.charAt(cur++)
+   if c is '<'
+     tok_state = tok_state_rawtext_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   if c is '' # EOF
+     return new_eof_token()
+   return new_character_token c
+
+ # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
+ # Script data state: consume one character of txt and return the token it
+ # produces, or null when only the tokenizer state changed.
+ tok_state_script_data = ->
+   c = txt.charAt(cur++)
+   if c is '<'
+     tok_state = tok_state_script_data_less_than_sign
+     return null
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   if c is '' # EOF
+     return new_eof_token()
+   return new_character_token c
+
+ # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
+ # PLAINTEXT state: consume one character of txt and always return a token
+ # for it (replacement character, EOF, or the character itself).
+ tok_state_plaintext = ->
+   c = txt.charAt(cur++)
+   if c is "\u0000"
+     parse_error()
+     return new_character_token "\ufffd"
+   if c is '' # EOF
+     return new_eof_token()
+   return new_character_token c
+
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
tok_cur_tag.name += c
return null
+ # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
+ # RCDATA less-than sign state: a '/' starts a possible end tag; any other
+ # character is put back and the '<' is emitted as text.
+ tok_state_rcdata_less_than_sign = ->
+   c = txt.charAt(cur++)
+   unless c is '/'
+     # Anything else
+     tok_state = tok_state_rcdata
+     cur -= 1 # reconsume the input character
+     return new_character_token '<'
+   temporary_buffer = ''
+   tok_state = tok_state_rcdata_end_tag_open
+   return null
+
+ # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
+ # RCDATA end tag open state: an ASCII letter starts a new end tag token
+ # (name lower-cased, raw character kept in temporary_buffer); anything
+ # else is put back and "</" is emitted as text.
+ # Returns null when a tag was started, otherwise a character token.
+ tok_state_rcdata_end_tag_open = ->
+   c = txt.charAt(cur++)
+   # charAt() yields '' at end of input and indexOf('') is 0, so EOF has to
+   # be excluded explicitly or it would be mistaken for a letter
+   if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+     tok_cur_tag = new_end_tag c.toLowerCase()
+     temporary_buffer += c
+     tok_state = tok_state_rcdata_end_tag_name
+     return null
+   # Anything else
+   tok_state = tok_state_rcdata
+   cur -= 1 # reconsume the input character
+   return new_character_token "</" # fixfull separate these
+
+ # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
+ # True when t is an end tag token whose name equals the name of the
+ # element on top of open_els; also writes one debug_log line per call.
+ is_appropriate_end_tag = (t) ->
+ # spec says to check against "the tag name of the last start tag to
+ # have been emitted from this tokenizer", but this is only called from
+ # the various "raw" states, which I'm pretty sure all push the start
+ # token onto open_els. TODO: verify this after the script data states
+ # are implemented
+ debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+ return t.type is TYPE_END_TAG and t.name is open_els[0].name
+
+ # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
+ # RCDATA end tag name state: keeps collecting letters into tok_cur_tag
+ # (and their raw form into temporary_buffer). Whitespace, '/' or '>' end
+ # the name, but only when the tag is the appropriate end tag; otherwise,
+ # and for any other character, the collected text is emitted as
+ # characters and the tokenizer returns to the RCDATA state.
+ tok_state_rcdata_end_tag_name = ->
+   c = txt.charAt(cur++)
+   if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_before_attribute_name
+       return
+     # else fall through to "Anything else"
+   if c is '/'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
+       return
+     # else fall through to "Anything else"
+   if c is '>'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_data
+       return tok_cur_tag
+     # else fall through to "Anything else"
+   # charAt() yields '' at end of input and indexOf('') is 0; without the
+   # explicit '' guards EOF would count as a letter and this state would
+   # never be left
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c.toLowerCase()
+     temporary_buffer += c
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c
+     temporary_buffer += c
+     return null
+   # Anything else
+   tok_state = tok_state_rcdata
+   cur -= 1 # reconsume the input character
+   return new_character_token '</' + temporary_buffer # fixfull separate these
+
+ # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
+ # RAWTEXT less-than sign state: a '/' starts a possible end tag; any other
+ # character is put back and the '<' is emitted as text.
+ tok_state_rawtext_less_than_sign = ->
+   c = txt.charAt(cur++)
+   unless c is '/'
+     # Anything else
+     tok_state = tok_state_rawtext
+     cur -= 1 # reconsume the input character
+     return new_character_token '<'
+   temporary_buffer = ''
+   tok_state = tok_state_rawtext_end_tag_open
+   return null
+
+ # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
+ # RAWTEXT end tag open state: an ASCII letter starts a new end tag token
+ # (name lower-cased, raw character kept in temporary_buffer); anything
+ # else is put back and "</" is emitted as text.
+ # Returns null when a tag was started, otherwise a character token.
+ tok_state_rawtext_end_tag_open = ->
+   c = txt.charAt(cur++)
+   # charAt() yields '' at end of input and indexOf('') is 0, so EOF has to
+   # be excluded explicitly or it would be mistaken for a letter
+   if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+     tok_cur_tag = new_end_tag c.toLowerCase()
+     temporary_buffer += c
+     tok_state = tok_state_rawtext_end_tag_name
+     return null
+   # Anything else
+   tok_state = tok_state_rawtext
+   cur -= 1 # reconsume the input character
+   return new_character_token "</" # fixfull separate these
+
+ # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
+ # RAWTEXT end tag name state: keeps collecting letters into tok_cur_tag
+ # (and their raw form into temporary_buffer). Whitespace, '/' or '>' end
+ # the name, but only when the tag is the appropriate end tag; otherwise,
+ # and for any other character, the collected text is emitted as
+ # characters and the tokenizer returns to the RAWTEXT state.
+ tok_state_rawtext_end_tag_name = ->
+   c = txt.charAt(cur++)
+   if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_before_attribute_name
+       return
+     # else fall through to "Anything else"
+   if c is '/'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_self_closing_start_tag
+       return
+     # else fall through to "Anything else"
+   if c is '>'
+     if is_appropriate_end_tag tok_cur_tag
+       tok_state = tok_state_data
+       return tok_cur_tag
+     # else fall through to "Anything else"
+   # charAt() yields '' at end of input and indexOf('') is 0; without the
+   # explicit '' guards EOF would count as a letter and this state would
+   # never be left
+   if c isnt '' and uc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c.toLowerCase()
+     temporary_buffer += c
+     return null
+   if c isnt '' and lc_alpha.indexOf(c) > -1
+     tok_cur_tag.name += c
+     temporary_buffer += c
+     return null
+   # Anything else
+   tok_state = tok_state_rawtext
+   cur -= 1 # reconsume the input character
+   return new_character_token '</' + temporary_buffer # fixfull separate these
+
+ # TODO _all_ of the missing states here (17-33) are for parsing script tags
+
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_state_before_attribute_name = ->
attr_name = null
tok_cur_tag.attrs_a[0][0] += c
return null
+ # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
+ # After attribute name state: consume one character following an
+ # attribute name. Whitespace is skipped, '/', '=' and '>' switch state
+ # ('>' also emits the finished tag token), and any other character starts
+ # a new attribute at the front of tok_cur_tag.attrs_a.
+ # Returns the tag token on '>', otherwise nothing (undefined or null).
+ tok_state_after_attribute_name = ->
+   c = txt.charAt(cur++)
+   if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+     return
+   if c is '/'
+     tok_state = tok_state_self_closing_start_tag
+     return
+   if c is '='
+     tok_state = tok_state_before_attribute_value
+     return
+   if c is '>'
+     tok_state = tok_state_data
+     return tok_cur_tag # emit the current tag token
+   # EOF is tested before the letter checks: charAt() yields '' at end of
+   # input and indexOf('') is 0, which would otherwise look like a letter
+   if c is '' # EOF
+     parse_error()
+     tok_state = tok_state_data
+     cur -= 1 # reconsume
+     return
+   if uc_alpha.indexOf(c) > -1
+     tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
+     tok_state = tok_state_attribute_name
+     return
+   if c is "\u0000"
+     parse_error()
+     tok_cur_tag.attrs_a.unshift ["\ufffd", '']
+     tok_state = tok_state_attribute_name
+     return
+   if c is '"' or c is "'" or c is '<'
+     parse_error()
+     # fall through to Anything else
+   # Anything else
+   tok_cur_tag.attrs_a.unshift [c, '']
+   tok_state = tok_state_attribute_name
+   # explicit return so the assignment above is not implicitly returned
+   # (which would hand a function object back to the caller as if it were
+   # a token)
+   return null
+
# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
tok_state_before_attribute_value = ->
switch c = txt.charAt(cur++)
when '"'
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when "'"
tok_state = tok_state_after_attribute_value_quoted
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
when "\u0000"
# Parse error
tok_cur_tag.attrs_a[0][1] += "\ufffd"
when "\t", "\n", "\u000c", ' '
tok_state = tok_state_before_attribute_name
when '&'
- tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+ tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
when '>'
tok_state = tok_state_data
tmp = tok_cur_tag
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
- tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+ parse_character_reference = (allowed_char = null, in_attr = false) ->
if cur >= txt.length
return '&'
switch c = txt.charAt(cur)
# see comments on TYPE_TAG/etc for the structure of this data
tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
open_els = [tree]
+ afe = [] # active formatting elements
+ template_insertion_modes = []
insertion_mode = ins_mode_in_body
+ original_insertion_mode = insertion_mode # TODO check spec
+ flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
flag_frameset_ok = true
flag_parsing = true
flag_foster_parenting = false
form_element_pointer = null
- afe = [] # active formatting elements
+ temporary_buffer = null
# tokenizer initialization
tok_state = tok_state_data
prev_node_id = 0 # reset counter
parsed = parse_html args.html, errors_cb
serialized = serialize_els parsed, false, false
- if serialized isnt args.expected # or parse_errors.length isnt args.errors
+ if serialized isnt args.expected
debug_log_each (str) ->
console.log str
console.log "FAILED: \"#{args.name}\""
- else
- console.log "passed \"#{args.name}\""
- if serialized isnt args.expected
console.log " Input: #{args.html}"
console.log " Correct: #{args.expected}"
console.log " Output: #{serialized}"
- if parse_errors.length isnt args.errors
- console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+ if parse_errors.length > 0
+ console.log " parse errs: #{JSON.stringify parse_errors}"
+ else
+ console.log " No parse errors"
+ else
+ console.log "passed \"#{args.name}\""
test_parser name: "empty", \
html: "",
- expected: '',
- errors: 0
+ expected: ''
test_parser name: "just text", \
html: "abc",
- expected: 'text:"abc"',
- errors: 0
+ expected: 'text:"abc"'
test_parser name: "named entity", \
html: "a&1234",
- expected: 'text:"a&1234"',
- errors: 0
+ expected: 'text:"a&1234"'
test_parser name: "broken named character references", \
html: "1&2&&3&aabbcc;",
- expected: 'text:"1&2&&3&aabbcc;"',
- errors: 2
+ expected: 'text:"1&2&&3&aabbcc;"'
test_parser name: "numbered entity overrides", \
html: "1€€ ƒ",
- expected: 'text:"1€€ ƒ"',
- errors: 0
+ expected: 'text:"1€€ ƒ"'
test_parser name: "open tag", \
html: "foo<span>bar",
- expected: 'text:"foo",tag:"span",{},[text:"bar"]',
- errors: 1 # no close tag
+ expected: 'text:"foo",tag:"span",{},[text:"bar"]'
test_parser name: "open tag with attributes", \
html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
- expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
- errors: 1 # no close tag
+ expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
test_parser name: "open tag with attributes of various quotings", \
html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
- expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
- errors: 1 # no close tag
+ expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
test_parser name: "attribute entity exceptions dq", \
html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
- errors: 2 # no close tag, &= in attr
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions sq", \
html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
- errors: 2 # no close tag, &= in attr
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions uq", \
html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
- errors: 2 # no close tag, &= in attr
+ expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
test_parser name: "matching closing tags", \
html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
- expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
- errors: 0
+ expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
test_parser name: "missing closing tag inside", \
html: "foo<div>bar<span>baz</div>qux",
- expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
- errors: 1 # close tag mismatch
+ expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
test_parser name: "mis-matched closing tags", \
html: "<span>12<div>34</span>56</div>78",
- expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
- errors: 2 # misplaced </span>, no </span> at the end
+ expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
test_parser name: "mis-matched formatting elements", \
html: "12<b>34<i>56</b>78</i>90",
- expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
- errors: 1 # no idea how many their should be
+ expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
html: '<p>1<b>2<i>3</b>4</i>5</p>',
- expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
- errors: 1
+ expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
html: '<b>1<p>2</b>3</p>',
- expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
- errors: 1
+ expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
test_parser name: "crazy formatting elements test", \
html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
# chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
# firefox does this:
- expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
- errors: 6 # no idea how many there should be
+ expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
test_parser name: "html5lib aaa 1", \
html: '<a><p></a></p>',
- expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
- errors: 2
+ expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
test_parser name: "html5lib aaa 2", \
html: '<a>1<p>2</a>3</p>',
- expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
- errors: 2
+ expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 3", \
html: '<a>1<button>2</a>3</button>',
- expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
- errors: 2
+ expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 4", \
html: '<a>1<b>2</a>3</b>',
- expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
- errors: 2
+ expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
test_parser name: "html5lib aaa 5 (two divs deep)", \
html: '<a>1<div>2<div>3</a>4</div>5</div>',
- expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
- errors: 3
+ expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
test_parser name: "html5lib aaa 6 (foster parenting)", \
html: '<table><a>1<p>2</a>3</p>',
- expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]',
- errors: 10
+ expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
+test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
+ html: '<b><b><a><p></a>',
+ expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
+ html: '<b><a><b><p></a>',
+ expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
+ html: '<a><b><b><p></a>',
+ expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
+ html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
+ expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
html: '<table><a>1<td>2</td>3</table>',
- expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]',
- errors: 10
+ expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
+test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
+ html: '<table>A<td>B</td>C</table>',
+ expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
+# TODO implement svg and namespacing
+#test_parser name: "html5lib aaa 13 (svg tr input)", \
+# html: '<a><svg><tr><input></a>',
+# expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
+test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
+ html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
+ expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
+test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
+ html: '<div><a><b><u><i><code><div></a>',
+ expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
+test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
+ html: '<b><b><b><b>x</b></b></b></b>y',
+ expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
+test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
+ html: '<p><b><b><b><b><p>x',
+ expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
+test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
+ html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
+ expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
+test_parser name: "junk after attribute close-quote", \
+ html: '<p><b c="d", e="f">foo<p>x',
+ expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
+test_parser name: "html5lib aaa02 1", \
+ html: '<b>1<i>2<p>3</b>4',
+ expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
+test_parser name: "html5lib aaa02 2", \
+ html: '<a><div><style></style><address><a>',
+ expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'