new_text_node = (txt) ->
return new Node TYPE_TEXT, text: txt
new_character_token = new_text_node
-new_comment_node = (txt) ->
+new_comment_token = (txt) ->
return new Node TYPE_COMMENT, text: txt
+new_doctype_token = (name) ->
+ return new Node TYPE_DOCTYPE, name: name
new_eof_token = ->
return new Node TYPE_EOF
new_afe_marker = ->
pending_table_character_tokens = null
head_element_pointer = null
flag_fragment_parsing = null
+ context_element = null
stop_parsing = ->
flag_parsing = false
node = open_els[node_i]
# 19. Return to the step labeled loop.
+ # 8.2.3.2
+
+ # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
+ adjusted_current_node = ->
+ if open_els.length is 1 and flag_fragment_parsing
+ return context_element
+ return open_els[0]
+
# http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
# this implementation is structured (mostly) as described at the link above.
# capitalized comments are the "labels" described at the link above.
doc.children.push t
return
if t.type is TYPE_DOCTYPE
+ # FIXME check identifiers, set quirks, etc
# fixfull
- t.name = 'html'
doc.children.push t
insertion_mode = ins_mode_before_html
return
# fixfull encoding stuff
return
if t.type is TYPE_START_TAG and t.name is 'title'
- parse_generic_rcdata_element t
+ parse_generic_rcdata_text t
return
if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
parse_generic_raw_text t
if t.type is TYPE_START_TAG and t.name is 'script'
ail = adjusted_insertion_location()
el = token_to_element t, NS_HTML, ail
- el.flag_parser_inserted true # FIXME implement
+ el.flag 'parser-inserted', true # FIXME implement
# fixfull frament case
ail[0].children.splice ail[1], 0, el
open_els.unshift el
parse_error()
return
ins_mode_in_head_else t
-
+
# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
ins_mode_in_head_noscript = (t) ->
# FIXME ?fixfull
console.log "ins_mode_in_head_noscript unimplemented"
-
+
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
ins_mode_after_head_else = (t) ->
body_tok = new_open_tag 'body'
tok_state = tok_state_end_tag_open
when '?'
parse_error()
+ tok_cur_tag = new_comment_token '?'
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
tok_state = tok_state_tag_name
else
parse_error()
+ tok_cur_tag = new_comment_token '/'
tok_state = tok_state_bogus_comment
return null
cur -= 1 # we didn't handle that char
return null
+ # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
+ # WARNING: put a comment token in tok_cur_tag before setting this state
+ tok_state_bogus_comment = ->
+ next_gt = txt.indexOf '>', cur
+ if next_gt is -1
+ val = txt.substr cur
+ cur = txt.length
+ else
+ val = txt.substr cur, (next_gt - cur)
+ cur = next_gt + 1
+ val = val.replace "\u0000", "\ufffd"
+ tok_cur_tag.text += val
+ tok_state = tok_state_data
+ return tok_cur_tag
+
+ # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
+ tok_state_markup_declaration_open = ->
+ if txt.substr(cur, 2) is '--'
+ cur += 2
+ tok_cur_tag = new_comment_token ''
+ tok_state = tok_state_comment_start
+ return
+ if txt.substr(cur, 7).toLowerCase() is 'doctype'
+ cur += 7
+ tok_state = tok_state_doctype
+ return
+ acn = adjusted_current_node()
+ if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
+ cur += 7
+ tok_state = tok_state_cdata_section
+ return
+ # Otherwise
+ parse_error()
+ tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
+ tok_state = tok_state_bogus_comment
+ return
+
+ # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
+ tok_state_comment_start = ->
+ switch c = txt.charAt(cur++)
+ when '-'
+ tok_state = tok_state_comment_start_dash
+ when "\u0000"
+ parse_error()
+ return new_character_token "\ufffd"
+ when '>'
+ parse_error()
+ tok_state = tok_state_data
+ return tok_cur_tag
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ tok_cur_tag.text += c
+ return null
+
+ # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
+ tok_state_comment_start_dash = ->
+ switch c = txt.charAt(cur++)
+ when '-'
+ tok_state = tok_state_comment_end
+ when "\u0000"
+ parse_error()
+ tok_cur_tag.text += "-\ufffd"
+ tok_state = tok_state_comment
+ when '>'
+ parse_error()
+ tok_state = tok_state_data
+ return tok_cur_tag
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ tok_cur_tag.text += "-#{c}"
+ tok_state = tok_state_comment
+ return null
+
+ # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
+ tok_state_comment = ->
+ switch c = txt.charAt(cur++)
+ when '-'
+ tok_state = tok_state_comment_end_dash
+ when "\u0000"
+ parse_error()
+ tok_cur_tag.text += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ tok_cur_tag.text += c
+ return null
+
+ # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
+ tok_state_comment_end_dash = ->
+ switch c = txt.charAt(cur++)
+ when '-'
+ tok_state = tok_state_comment_end
+ when "\u0000"
+ parse_error()
+ tok_cur_tag.text += "-\ufffd"
+ tok_state = tok_state_comment
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ tok_cur_tag.text += "-#{c}"
+ tok_state = tok_state_comment
+ return null
+
+ # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
+ tok_state_comment_end = ->
+ switch c = txt.charAt(cur++)
+ when '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ when "\u0000"
+ parse_error()
+ tok_cur_tag.text += "--\ufffd"
+ tok_state = tok_state_comment
+ when '!'
+ parse_error()
+ tok_state = tok_state_comment_end_bang
+ when '-'
+ parse_error()
+ tok_cur_tag.text += '-'
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ parse_error()
+ tok_cur_tag.text += "--#{c}"
+ tok_state = tok_state_comment
+ return null
+
+ # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
+ tok_state_comment_end_bang = ->
+ switch c = txt.charAt(cur++)
+ when '-'
+ tok_cur_tag.text += "--!#{c}"
+ tok_state = tok_state_comment_end_dash
+ when '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ when "\u0000"
+ parse_error()
+ tok_cur_tag.text += "--!\ufffd"
+ tok_state = tok_state_comment
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ else
+ tok_cur_tag.text += "--!#{c}"
+ tok_state = tok_state_comment
+ return null
+
+ # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+ tok_state_doctype = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\u000a", "\u000c", ' '
+ tok_state = tok_state_before_doctype_name
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ el = new_doctype_token ''
+ el.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return el
+ else
+ parse_error()
+ tok_state = tok_state_before_doctype_name
+ cur -= 1 # Reconsume
+ return null
+
+ # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+ tok_state_before_doctype_name = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag = new_doctype_token c.toLowerCase()
+ tok_state = tok_state_doctype_name
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag = new_doctype_token "\ufffd"
+ tok_state = tok_state_doctype_name
+ return
+ if c is '>'
+ parse_error()
+ el = new_doctype_token ''
+ el.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return el
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ el = new_doctype_token ''
+ el.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return el
+ # Anything else
+ tok_cur_tag = new_doctype_token c
+ tok_state = tok_state_doctype_name
+ return null
+
+ # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
+ tok_state_doctype_name = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ tok_state = tok_state_after_doctype_name
+ return
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag.name += c.toLowerCase()
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag.name += "\ufffd"
+ return
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ tok_cur_tag.name += c
+ return null
+
+ # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
+ tok_state_after_doctype_name = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ if txt.substr(cur - 1, 6).toLowerCase() is 'public'
+ cur += 5
+ tok_state = tok_state_after_doctype_public_keyword
+ return
+ if txt.substr(cur - 1, 6).toLowerCase() is 'system'
+ cur += 5
+ tok_state = tok_state_after_doctype_system_keyword
+ return
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
+ tok_state_after_doctype_public_keyword = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ tok_state = tok_state_before_doctype_public_identifier
+ return
+ if c is '"'
+ parse_error()
+ tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+ tok_state = tok_state_doctype_public_identifier_double_quoted
+ return
+ if c is "'"
+ parse_error()
+ tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+ tok_state = tok_state_doctype_public_identifier_single_quoted
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
+ tok_state_before_doctype_public_identifier = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if c is '"'
+ parse_error()
+ tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+ tok_state = tok_state_doctype_public_identifier_double_quoted
+ return
+ if c is "'"
+ parse_error()
+ tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+ tok_state = tok_state_doctype_public_identifier_single_quoted
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+
+ # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
+ tok_state_doctype_public_identifier_double_quoted = ->
+ c = txt.charAt(cur++)
+ if c is '"'
+ tok_state = tok_state_after_doctype_public_identifier
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag.public_identifier += "\ufffd"
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ tok_cur_tag.public_identifier += c
+ return null
+
+ # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
+ tok_state_doctype_public_identifier_single_quoted = ->
+ c = txt.charAt(cur++)
+ if c is "'"
+ tok_state = tok_state_after_doctype_public_identifier
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag.public_identifier += "\ufffd"
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ tok_cur_tag.public_identifier += c
+ return null
+
+ # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
+ tok_state_after_doctype_public_identifier = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ tok_state = tok_state_between_doctype_public_and_system_identifiers
+ return
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '"'
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_double_quoted
+ return
+ if c is "'"
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_single_quoted
+ return
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
+ tok_state_between_doctype_public_and_system_identifiers = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '"'
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_double_quoted
+ return
+ if c is "'"
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_single_quoted
+ return
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
+ tok_state_after_doctype_system_keyword = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ tok_state = tok_state_before_doctype_system_identifier
+ return
+ if c is '"'
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_double_quoted
+ return
+ if c is "'"
+ parse_error()
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_single_quoted
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
+ tok_state_before_doctype_system_identifier = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if c is '"'
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_double_quoted
+ return
+ if c is "'"
+ tok_cur_tag.system_identifier = ''
+ tok_state = tok_state_doctype_system_identifier_single_quoted
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
+ tok_state_doctype_system_identifier_double_quoted = ->
+ c = txt.charAt(cur++)
+ if c is '"'
+ tok_state = tok_state_after_doctype_system_identifier
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag.system_identifier += "\ufffd"
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ tok_cur_tag.system_identifier += c
+ return null
+
+ # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
+ tok_state_doctype_system_identifier_single_quoted = ->
+ c = txt.charAt(cur++)
+ if c is "'"
+ tok_state = tok_state_after_doctype_system_identifier
+ return
+ if c is "\u0000"
+ parse_error()
+ tok_cur_tag.system_identifier += "\ufffd"
+ return
+ if c is '>'
+ parse_error()
+ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ tok_cur_tag.system_identifier += c
+ return null
+
+ # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
+ tok_state_after_doctype_system_identifier = ->
+ c = txt.charAt(cur++)
+ if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+ return
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ tok_cur_tag.flag 'force-quirks', true
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ parse_error()
+ # do _not_ tok_cur_tag.flag 'force-quirks', true
+ tok_state = tok_state_bogus_doctype
+ return null
+
+ # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
+ tok_state_bogus_doctype = ->
+ c = txt.charAt(cur++)
+ if c is '>'
+ tok_state = tok_state_data
+ return tok_cur_tag
+ if c is '' # EOF
+ tok_state = tok_state_data
+ cur -= 1 # Reconsume
+ return tok_cur_tag
+ # Anything else
+ return null
+
+
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
pending_table_character_tokens = []
head_element_pointer = null
flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
+ context_element = null # FIXME initialize from args.fragment
# tokenizer initialization
tok_state = tok_state_data