else
console.log "Parse error at character #{cur} of #{txt.length}"
+ # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+ # "Noah's Ark clause" but with three
# Adds new_el to the front of afe (the list of active formatting elements).
# Before adding, walks afe from index 0 and counts entries whose name,
# namespace and attributes all equal new_el's. In the added (+) lines the walk
# stops at the first TYPE_AFE_MARKER entry, and when the count reaches three
# the entry at the current index is spliced out.
afe_push = (new_el) ->
matches = 0
for el, i in afe
+ if el.type is TYPE_AFE_MARKER
+ break
if el.name is new_el.name and el.namespace is new_el.namespace
+ attrs_match = true
# attributes are compared in both directions: every attribute of el must have
# the same value on new_el, and every attribute of new_el the same value on el
for k, v of el.attrs
- continue unless new_el.attrs[k] is v
- for k, v of new_el.attrs
- continue unless el.attrs[k] is v
- matches += 1
- if matches is 3
- afe.splice i, 1
- break
+ unless new_el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ for k, v of new_el.attrs
+ unless el.attrs[k] is v
+ attrs_match = false
+ break
+ if attrs_match
+ matches += 1
+ if matches is 3
+ afe.splice i, 1
+ break
# new entries go at index 0, so lower indexes hold more recently pushed entries
afe.unshift new_el
# Puts a fresh marker (from new_afe_marker) at the front of afe.
afe_push_marker = -> afe.unshift(new_afe_marker())
loop
if node_i is open_els.length - 1
last = true
- # fixfull (fragment case)
-
+ if flag_fragment_parsing
+ node = context_element
# 4. If node is a select element, run these substeps:
if node.name is 'select' and node.namespace is NS_HTML
# 1. If last is true, jump to the step below labeled done.
if t.type is TYPE_START_TAG and t.name is 'html'
el = token_to_element t, NS_HTML, doc
doc.children.push el
+ el.document = doc
open_els.unshift(el)
# fixfull (big paragraph in spec about manifest, fragment, urls, etc)
ins_mode = ins_mode_before_head
# Anything else
el = token_to_element new_open_tag('html'), NS_HTML, doc
doc.children.push el
- el.parent = doc
+ el.document = doc
open_els.unshift el
# ?fixfull browsing context
ins_mode = ins_mode_before_head
# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# Handles an end tag called `name` by walking open_els starting at open_els[0].
# When the node under inspection has that name in the HTML namespace: implied
# end tags are generated (with name as the exception), a parse error is
# reported unless the node is open_els[0], and entries are shifted off
# open_els until the node itself has been removed.
# When the node is listed in special_elements for its namespace instead, a
# parse error is reported and the end tag is dropped.
# Otherwise the walk moves on to the entry after node in open_els.
# NOTE(review): if the walk runs off the end of open_els, node becomes
# undefined and node.name would throw; presumably a special element is always
# reached first -- confirm.
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
- for el, i in open_els
- if el.name is name and el.namespace is NS_HTML
+ node = open_els[0]
+ loop
+ if node.name is name and node.namespace is NS_HTML
generate_implied_end_tags name # arg is exception
- parse_error() unless i is 0
- while i >= 0
- open_els.shift()
- i -= 1
- return
- if special_elements[el.name] is el.namespace
+ unless node is open_els[0]
+ parse_error()
+ loop
+ el = open_els.shift()
+ if el is node
+ return
+ if special_elements[node.name] is node.namespace
parse_error()
return
# advance: find node's current position in open_els and take the entry after it
+ for el, i in open_els
+ if node is el
+ node = open_els[i + 1]
+ break
return
ins_mode_in_body = (t) ->
if t.type is TYPE_TEXT and t.text is "\u0000"
if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
close_p_if_in_button_scope()
insert_html_element t
- # spec: If the next token is a "LF" (U+000A) character token, then
- # ignore that token and move on to the next one. (Newlines at the
- # start of pre blocks are ignored as an authoring convenience.)
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
flag_frameset_ok = false
return
if t.type is TYPE_START_TAG and t.name is 'form'
return
if t.type is TYPE_START_TAG and t.name is 'nobr'
reconstruct_afe()
+ if is_in_scope 'nobr', NS_HTML
+ parse_error()
+ adoption_agency 'nobr'
+ reconstruct_afe()
el = insert_html_element t
afe_push el
return
return
if t.type is TYPE_END_TAG and t.name is 'br'
parse_error()
- t.type = TYPE_START_TAG
+ # W3C: t.type = TYPE_START_TAG
+ t = new_open_tag 'br' # WHATWG
# fall through
if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
reconstruct_afe()
unless is_input_hidden_tok t
flag_frameset_ok = false
return
- if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+ if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+ # WHATWG adds 'menuitem' for this block
insert_html_element t
open_els.shift()
t.acknowledge_self_closing()
return
if t.type is TYPE_START_TAG and t.name is 'textarea'
insert_html_element t
- if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
- cur += 1
+ eat_next_token_if_newline()
tok_state = tok_state_rcdata
original_ins_mode = ins_mode
flag_frameset_ok = false
insert_html_element t
return
if t.type is TYPE_END_TAG and t.name is 'optgroup'
- if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+ if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# the option is only popped when the node just before it in open_els is an
# HTML optgroup, so the name test and the namespace test must both look at
# open_els[1] (open_els[0] was already checked on the line above)
if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
open_els.shift()
if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
return
if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
parse_error()
- if is_in_select_scope 'select', NS_HTML
+ unless is_in_select_scope 'select', NS_HTML
return
loop
el = open_els.shift()
tok_state = tok_state_tag_open
when "\u0000"
parse_error()
- return new_text_node "\ufffd"
+ return new_text_node c
when '' # EOF
return new_eof_token()
else
return
if c is '>'
tok_state = tok_state_data
- return
+ return tok_cur_tag
if is_uc_alpha(c)
tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
tok_state = tok_state_attribute_name
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- return new_character_token val # fixfull split
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
+ if val.length > 0
+ return new_character_token val # fixfull split
+ return null
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
return '&'
return # never reached
# Looks ahead one token: calls tok_state() until it yields a non-null token.
# If that token is a text token holding a newline, it stays consumed (cur is
# left where the tokenizer put it); otherwise cur is set back to where it was
# on entry so the token is produced again later.
# NOTE(review): only cur is restored on rewind; any other state changed by the
# tok_state() calls (tok_state itself, parse errors already reported) is left
# as is -- confirm that is acceptable for every caller.
+ eat_next_token_if_newline = ->
+ old_cur = cur
+ t = null
+ until t?
+ t = tok_state()
+ if t.type is TYPE_TEXT
+ # definition of a newline depends on whether it was a character ref or not
+ if cur - old_cur is 1
+ # not a character reference
+ if t.text is "\u000d" or t.text is "\u000a"
+ return
+ else
# more than one input character was consumed: only U+000A counts here
+ if t.text is "\u000a"
+ return
+ # not a "newline"
+ cur = old_cur
+ return
+
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
txt = args.html
cur = 0
- doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+ fragment_root = null # fragment parsing algorithm returns children of this
open_els = []
afe = [] # active formatting elements
template_ins_modes = []
temporary_buffer = null
pending_table_character_tokens = []
head_element_pointer = null
- flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
- context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+ flag_fragment_parsing = false
+ context_element = null
prev_node_id = 0 # just for debugging
# tokenizer initialization
tok_state = tok_state_data
- # text pre-processing
- # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
- txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
- txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
- txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# One-time setup run before the main loop.
# Reads args.fragment (a string, optionally prefixed with "math " or "svg ")
# or args.context (used directly as the context element). When a context
# element results, switches to fragment parsing: a new doc and fragment_root
# are created, quirks mode is copied from the context's document if one is
# found, the initial tokenizer state is chosen from the context element's
# name, the insertion mode is reset, and form_element_pointer is set from the
# nearest 'form' ancestor. Finally normalizes "\r\n" and "\r" in txt to "\n".
+ parse_init = ->
+ # fragment parsing (text arg)
+ if args.fragment?
+ # this handles the fragment from the tests in the format described here:
+ # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+ f = args.fragment
+ ns = NS_HTML
+ if f.substr(0, 5) is 'math '
+ f = f.substr 5
+ ns = NS_MATHML
+ else if f.substr(0, 4) is 'svg '
+ f = f.substr 4
+ ns = NS_SVG
+ t = new_open_tag f
# NOTE(review): token_to_element is called with three arguments elsewhere in
# this file; here the third is omitted -- confirm it is optional
+ context_element = token_to_element t, ns
+ context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+ context_element.document.flag 'quirks mode', QUIRKS_NO
+ # fragment parsing (Node arg)
+ if args.context?
+ context_element = args.context
+
+ # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+ # fragment parsing algorithm
+ if context_element?
+ flag_fragment_parsing = true
+ doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ # search up the tree from context, to try to find its document,
+ # because this file only puts a "document" property on the root
+ # element.
+ old_doc = null
+ el = context_element
+ loop
+ if el.document?
+ old_doc = el.document
+ break
+ if el.parent
+ el = el.parent
+ else
+ break
+ if old_doc
+ doc.flag 'quirks mode', old_doc.flag 'quirks mode'
+ # set tok_state
+ if context_element.namespace is NS_HTML
+ switch context_element.name
+ when 'title', 'textarea'
+ tok_state = tok_state_rcdata
+ when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+ tok_state = tok_state_rawtext
+ when 'script'
+ tok_state = tok_state_script_data
+ when 'noscript'
+ if flag_scripting
+ tok_state = tok_state_rawtext
+ when 'plaintext'
+ tok_state = tok_state_plaintext
# the 'html' root created here is the only entry in open_els to start with
+ fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+ doc.children.push fragment_root
+ fragment_root.document = doc
+ open_els = [fragment_root]
+ if context_element.name is 'template' and context_element.namespace is NS_HTML
+ template_ins_modes.unshift ins_mode_in_template
+ # fixfull create token for context (it should have its original one already)
+ reset_ins_mode()
+ # set form_element pointer... in the foreign doc?!
+ el = context_element
+ loop
+ if el.name is 'form' and el.namespace is NS_HTML
+ form_element_pointer = el
+ break
+ if el.parent
+ el = el.parent
+ else
+ break
+
+ # text pre-processing
+ # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+ txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
- if args.name is "tests20.dat #22"
- console.log "hi"
- # proccess input
# http://www.w3.org/TR/html5/syntax.html#tree-construction
- while flag_parsing
- t = tok_state()
- if t?
- process_token t
- # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
# Drives the parser: while flag_parsing is set, calls the current tokenizer
# state function and hands every non-null token to process_token.
+ parse_main_loop = ->
+ while flag_parsing
+ t = tok_state()
+ if t?
+ process_token t
+ # fixfull parse error if has self-closing flag, but it wasn't acknowledged
+ return
+ parse_init()
+ parse_main_loop()
+
+ if flag_fragment_parsing
+ return fragment_root.children
return doc.children
serialize_els = (els, shallow, show_ids) ->