alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"
+# Reports whether `str` is a single character found in uc_alpha. The length
+# test comes first because indexOf('') is 0, so an empty string (what charAt
+# yields past the end of the input) would otherwise look like a match.
+is_uc_alpha = (str) ->
+    return false unless str.length is 1
+    return uc_alpha.indexOf(str) isnt -1
+# Reports whether `str` is a single character found in lc_alpha. The length
+# test comes first because indexOf('') is 0, so an empty string (what charAt
+# yields past the end of the input) would otherwise look like a match.
+is_lc_alpha = (str) ->
+    return false unless str.length is 1
+    return lc_alpha.indexOf(str) isnt -1
+
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
is_space_tok = (t) ->
return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
+# Reports whether token `t` is a start tag that has a `type` attribute whose
+# value is "hidden" (compared case-insensitively). The first `type` entry
+# found in t.attrs_a decides the result; a start tag with no `type`
+# attribute, or any non-start-tag token, yields false.
+# Note: the tag name itself is not checked here.
+is_input_hidden_tok = (t) ->
+    return false unless t.type is TYPE_START_TAG
+    # attrs_a holds [name, value] pairs, so walk its elements with `in`.
+    # (`of` would iterate the index keys "0", "1", ... and never match.)
+    for a in t.attrs_a
+        if a[0] is 'type'
+            return a[1].toLowerCase() is 'hidden'
+    return false
+
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
for t in open_els
if t.name is tag_name and (namespace is null or namespace is t.namespace)
return true
- if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
+ if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
return false
return false
# this checks for a particular element, not by name
return
clear_afe_to_marker = ->
loop
+ return unless afe.length > 0 # this happens in fragment case, ?spec error
el = afe.shift()
if el.type is TYPE_AFE_MARKER
return
+ return
# 8.2.3.1 ...
# http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
# last template's template contents, after its last child (if
# any), and abort these substeps.
if last_template and (last_table is null or last_template_i < last_table_i)
- target = template # fixfull should be it's contents
+ target = last_template # fixfull should be it's contents
target_i = target.children.length
break
# 4. If there is no last table, then let adjusted insertion
return
if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
insert_html_element t
- insertion_mode = in_head_noscript # FIXME implement
+ insertion_mode = ins_mode_in_head_noscript # FIXME implement
return
if t.type is TYPE_START_TAG and t.name is 'script'
ail = adjusted_insertion_location()
when 'style', 'script', 'template'
ins_mode_in_head t
when 'input'
- if token_is_input_hidden t
+ if is_input_hidden_tok t
ins_mode_in_table_else t
else
parse_error()
if el.name is 'caption'
break
clear_afe_to_marker()
- insertion_mode = in_table
+ insertion_mode = ins_mode_in_table
else
parse_error()
# fragment case
if el.name is 'caption'
break
clear_afe_to_marker()
- insertion_mode = in_table
+ insertion_mode = ins_mode_in_table
insertion_mode t
# else fragment case
return
return
if t.type is TYPE_END_TAG and t.name is 'colgroup'
if open_els[0].name is 'colgroup'
- open_els[0].shift()
+ open_els.shift()
insertion_mode = ins_mode_in_table
else
parse_error()
if t.type is TYPE_END_TAG
parse_error()
return
- if t.type is EOF
+ if t.type is TYPE_EOF
unless template_tag_is_open()
stop_parsing()
return
open_els.shift()
t.acknowledge_self_closing()
return
- if t.type is TYPE_START TAG and t.name is 'noframes'
+ if t.type is TYPE_START_TAG and t.name is 'noframes'
ins_mode_in_head t
return
if t.type is TYPE_EOF
tok_cur_tag = new_comment_token '?'
tok_state = tok_state_bogus_comment
else
- if lc_alpha.indexOf(c) > -1
+ if is_lc_alpha(c)
tok_cur_tag = new_open_tag c
tok_state = tok_state_tag_name
- else if uc_alpha.indexOf(c) > -1
+ else if is_uc_alpha(c)
tok_cur_tag = new_open_tag c.toLowerCase()
tok_state = tok_state_tag_name
else
tok_state = tok_state_data
return new_text_node '</'
else
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag = new_end_tag c.toLowerCase()
tok_state = tok_state_tag_name
- else if lc_alpha.indexOf(c) > -1
+ else if is_lc_alpha(c)
tok_cur_tag = new_end_tag c
tok_state = tok_state_tag_name
else
parse_error()
tok_state = tok_state_data
else
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
else
tok_cur_tag.name += c
# 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
tok_state_rcdata_end_tag_open = ->
c = txt.charAt(cur++)
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag = new_end_tag c.toLowerCase()
temporary_buffer += c
tok_state = tok_state_rcdata_end_tag_name
return null
- if lc_alpha.indexOf(c) > -1
+ if is_lc_alpha(c)
tok_cur_tag = new_end_tag c
temporary_buffer += c
tok_state = tok_state_rcdata_end_tag_name
tok_state = tok_state_data
return tok_cur_tag
# else fall through to "Anything else"
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
temporary_buffer += c
return null
- if lc_alpha.indexOf(c) > -1
+ if is_lc_alpha(c)
tok_cur_tag.name += c
temporary_buffer += c
return null
# 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
tok_state_rawtext_end_tag_open = ->
c = txt.charAt(cur++)
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag = new_end_tag c.toLowerCase()
temporary_buffer += c
tok_state = tok_state_rawtext_end_tag_name
return null
- if lc_alpha.indexOf(c) > -1
+ if is_lc_alpha(c)
tok_cur_tag = new_end_tag c
temporary_buffer += c
tok_state = tok_state_rawtext_end_tag_name
tok_state = tok_state_data
return tok_cur_tag
# else fall through to "Anything else"
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
temporary_buffer += c
return null
- if lc_alpha.indexOf(c) > -1
+ if is_lc_alpha(c)
tok_cur_tag.name += c
temporary_buffer += c
return null
cur -= 1 # reconsume the input character
return new_character_token '</' + temporary_buffer # fixfull separate these
- # TODO _all_ of the missing states here (17-33) are for parsing script tags
+ # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
+ # Reads one character and chooses the next tokenizer state. Returns a
+ # character token when text must be emitted, otherwise nothing.
+ tok_state_script_data_less_than_sign = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '/'
+             # start collecting a possible end tag name
+             temporary_buffer = ''
+             tok_state = tok_state_script_data_end_tag_open
+             return
+         when '!'
+             tok_state = tok_state_script_data_escape_start
+             return new_character_token '<!' # fixfull split
+         else
+             # Anything else: put the character back and emit the "<" as text
+             cur -= 1 # Reconsume
+             tok_state = tok_state_script_data
+             return new_character_token '<'
+
+ # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
+ # A letter starts a new end tag token (name lowercased, raw character kept
+ # in temporary_buffer); anything else is put back and "</" is emitted as text.
+ tok_state_script_data_end_tag_open = ->
+     c = txt.charAt(cur++)
+     c_is_upper = is_uc_alpha(c)
+     if c_is_upper or is_lc_alpha(c)
+         tok_cur_tag = new_end_tag(if c_is_upper then c.toLowerCase() else c)
+         temporary_buffer += c
+         tok_state = tok_state_script_data_end_tag_name
+         return
+     # Anything else
+     cur -= 1 # Reconsume
+     tok_state = tok_state_script_data
+     return new_character_token '</'
+
+ # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
+ # Accumulates the name of a candidate end tag inside script data.
+ # - whitespace, "/" or ">" finish the tag, but only when
+ #   is_appropriate_end_tag accepts tok_cur_tag; otherwise they are treated
+ #   like any other character.
+ # - letters extend the tag name (lowercased) and temporary_buffer (raw).
+ # - anything else abandons the tag: the character is put back and
+ #   "</" + temporary_buffer is emitted as text.
+ # Returns the finished tag token on ">", a character token on the fallback
+ # path, and nothing otherwise.
+ tok_state_script_data_end_tag_name = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_before_attribute_name
+             return
+         # fall through
+     if c is '/'
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_self_closing_start_tag
+             return
+         # fall through
+     if c is '>'
+         # without this case an appropriate "</script>" is never emitted as a tag
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_data
+             return tok_cur_tag
+         # fall through
+     if is_uc_alpha(c)
+         tok_cur_tag.name += c.toLowerCase()
+         temporary_buffer += c
+         return
+     if is_lc_alpha(c)
+         tok_cur_tag.name += c
+         temporary_buffer += c
+         return
+     # Anything else
+     tok_state = tok_state_script_data
+     cur -= 1 # Reconsume
+     return new_character_token "</#{temporary_buffer}" # fixfull split
+
+ # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
+ # A "-" is emitted as text and moves on to the escape-start-dash state; any
+ # other character is put back and the tokenizer returns to script data.
+ tok_state_script_data_escape_start = ->
+     c = txt.charAt(cur++)
+     unless c is '-'
+         # Anything else
+         cur -= 1 # Reconsume
+         tok_state = tok_state_script_data
+         return
+     tok_state = tok_state_script_data_escape_start_dash
+     return new_character_token '-'
+
+ # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
+ # A second "-" is emitted as text and enters the escaped-dash-dash state;
+ # any other character is put back and the tokenizer returns to script data.
+ tok_state_script_data_escape_start_dash = ->
+     c = txt.charAt(cur++)
+     unless c is '-'
+         # Anything else
+         cur -= 1 # Reconsume
+         tok_state = tok_state_script_data
+         return
+     tok_state = tok_state_script_data_escaped_dash_dash
+     return new_character_token '-'
+
+ # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
+ # Reads one character. Returns a character token for text to emit, or
+ # nothing when only the tokenizer state changes.
+ tok_state_script_data_escaped = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             tok_state = tok_state_script_data_escaped_dash
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_escaped_less_than_sign
+             return
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             tok_state = tok_state_data
+             parse_error()
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             return new_character_token c
+
+ # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
+ # Reads the character after a single "-". Every branch except "<" and EOF
+ # emits a character token.
+ tok_state_script_data_escaped_dash = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             tok_state = tok_state_script_data_escaped_dash_dash
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_escaped_less_than_sign
+             return
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             tok_state = tok_state_script_data_escaped
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             tok_state = tok_state_data
+             parse_error()
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             tok_state = tok_state_script_data_escaped
+             return new_character_token c
+
+ # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
+ # Reads the character after "--". Further dashes stay in this state; ">"
+ # returns to script data; "<" and EOF emit nothing.
+ tok_state_script_data_escaped_dash_dash = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_escaped_less_than_sign
+             return
+         when '>'
+             tok_state = tok_state_script_data
+             return new_character_token '>'
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             tok_state = tok_state_script_data_escaped
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             tok_state = tok_state_script_data_escaped
+             return new_character_token c
+
+ # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
+ # Reads the character after a "<" that was held back (not yet emitted).
+ # - "/" starts a candidate end tag (temporary_buffer is reset).
+ # - a letter starts a candidate double-escape: temporary_buffer is reset to
+ #   the lowercased letter and "<" plus the raw letter is emitted as text.
+ # - anything else: the character is put back and only the held-back "<"
+ #   is emitted.
+ tok_state_script_data_escaped_less_than_sign = ->
+     c = txt.charAt(cur++)
+     if c is '/'
+         temporary_buffer = ''
+         tok_state = tok_state_script_data_escaped_end_tag_open
+         return
+     if is_uc_alpha(c)
+         temporary_buffer = c.toLowerCase() # yes, really
+         tok_state = tok_state_script_data_double_escape_start
+         return new_character_token "<#{c}" # fixfull split
+     if is_lc_alpha(c)
+         temporary_buffer = c
+         tok_state = tok_state_script_data_double_escape_start
+         return new_character_token "<#{c}" # fixfull split
+     # Anything else
+     tok_state = tok_state_script_data_escaped
+     cur -= 1 # Reconsume
+     # Emit the "<", not c: c was just put back and will be processed again,
+     # so emitting it here would duplicate it and drop the "<".
+     return new_character_token '<'
+
+ # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
+ # A letter starts a new end tag token (name lowercased, raw character kept
+ # in temporary_buffer); anything else is put back and "</" is emitted as text.
+ tok_state_script_data_escaped_end_tag_open = ->
+     c = txt.charAt(cur++)
+     c_is_upper = is_uc_alpha(c)
+     if c_is_upper or is_lc_alpha(c)
+         tok_cur_tag = new_end_tag(if c_is_upper then c.toLowerCase() else c)
+         temporary_buffer += c
+         tok_state = tok_state_script_data_escaped_end_tag_name
+         return
+     # Anything else
+     cur -= 1 # Reconsume
+     tok_state = tok_state_script_data_escaped
+     return new_character_token '</' # fixfull split
+
+ # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
+ # Accumulates the name of a candidate end tag inside escaped script data.
+ # - whitespace, "/" or ">" finish the tag, but only when
+ #   is_appropriate_end_tag accepts tok_cur_tag; otherwise they are treated
+ #   like any other character.
+ # - letters extend the tag name (lowercased) and temporary_buffer (raw).
+ # - anything else abandons the tag: the character is put back and
+ #   "</" + temporary_buffer is emitted as text.
+ # Returns the finished tag token on ">", a character token on the fallback
+ # path, and nothing otherwise.
+ tok_state_script_data_escaped_end_tag_name = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_before_attribute_name
+             return
+         # fall through
+     if c is '/'
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_self_closing_start_tag
+             return
+         # fall through
+     if c is '>'
+         # without this case an appropriate "</script>" is never emitted as a tag
+         if is_appropriate_end_tag tok_cur_tag
+             tok_state = tok_state_data
+             return tok_cur_tag
+         # fall through
+     if is_uc_alpha(c)
+         tok_cur_tag.name += c.toLowerCase()
+         # keep the original case: this buffer is re-emitted as text below
+         temporary_buffer += c
+         return
+     if is_lc_alpha(c)
+         tok_cur_tag.name += c
+         temporary_buffer += c
+         return
+     # Anything else
+     tok_state = tok_state_script_data_escaped
+     cur -= 1 # Reconsume
+     return new_character_token "</#{temporary_buffer}" # fixfull split
+
+ # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
+ # Collects letters (lowercased) into temporary_buffer while echoing each
+ # raw character as text. A delimiter decides the next state by comparing
+ # the buffer with "script".
+ tok_state_script_data_double_escape_start = ->
+     c = txt.charAt(cur++)
+     switch c
+         when "\t", "\u000a", "\u000c", ' ', '/', '>'
+             tok_state = if temporary_buffer is 'script' then tok_state_script_data_double_escaped else tok_state_script_data_escaped
+             return new_character_token c
+     c_is_upper = is_uc_alpha(c)
+     if c_is_upper or is_lc_alpha(c)
+         # yes, really lowercase (the buffer only; the emitted text keeps its case)
+         temporary_buffer += if c_is_upper then c.toLowerCase() else c
+         return new_character_token c
+     # Anything else
+     cur -= 1 # Reconsume
+     tok_state = tok_state_script_data_escaped
+     return
+
+ # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
+ # Reads one character. Every branch except EOF emits a character token.
+ tok_state_script_data_double_escaped = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             tok_state = tok_state_script_data_double_escaped_dash
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_double_escaped_less_than_sign
+             return new_character_token '<'
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             return new_character_token c
+
+ # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
+ # Reads the character after a single "-". Every branch except EOF emits a
+ # character token.
+ tok_state_script_data_double_escaped_dash = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             tok_state = tok_state_script_data_double_escaped_dash_dash
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_double_escaped_less_than_sign
+             return new_character_token '<'
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             tok_state = tok_state_script_data_double_escaped
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             tok_state = tok_state_script_data_double_escaped
+             return new_character_token c
+
+ # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
+ # Reads the character after "--". Further dashes stay in this state; ">"
+ # returns to script data. Every branch except EOF emits a character token.
+ tok_state_script_data_double_escaped_dash_dash = ->
+     c = txt.charAt(cur++)
+     switch c
+         when '-'
+             return new_character_token '-'
+         when '<'
+             tok_state = tok_state_script_data_double_escaped_less_than_sign
+             return new_character_token '<'
+         when '>'
+             tok_state = tok_state_script_data
+             return new_character_token '>'
+         when "\u0000"
+             # NUL is replaced by U+FFFD
+             parse_error()
+             tok_state = tok_state_script_data_double_escaped
+             return new_character_token "\ufffd"
+         when ''
+             # EOF: charAt returned the empty string
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return
+         else
+             # Anything else
+             tok_state = tok_state_script_data_double_escaped
+             return new_character_token c
+
+ # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
+ # A "/" resets temporary_buffer, is emitted as text and enters the
+ # double-escape-end state; any other character is put back and the
+ # tokenizer returns to the double-escaped state.
+ tok_state_script_data_double_escaped_less_than_sign = ->
+     c = txt.charAt(cur++)
+     unless c is '/'
+         # Anything else
+         cur -= 1 # Reconsume
+         tok_state = tok_state_script_data_double_escaped
+         return
+     temporary_buffer = ''
+     tok_state = tok_state_script_data_double_escape_end
+     return new_character_token '/'
+
+ # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
+ # Collects letters (lowercased) into temporary_buffer while echoing each
+ # raw character as text. A delimiter decides the next state by comparing
+ # the buffer with "script".
+ tok_state_script_data_double_escape_end = ->
+     c = txt.charAt(cur++)
+     switch c
+         when "\t", "\u000a", "\u000c", ' ', '/', '>'
+             tok_state = if temporary_buffer is 'script' then tok_state_script_data_escaped else tok_state_script_data_double_escaped
+             return new_character_token c
+     c_is_upper = is_uc_alpha(c)
+     if c_is_upper or is_lc_alpha(c)
+         # yes, really lowercase (the buffer only; the emitted text keeps its case)
+         temporary_buffer += if c_is_upper then c.toLowerCase() else c
+         return new_character_token c
+     # Anything else
+     cur -= 1 # Reconsume
+     tok_state = tok_state_script_data_double_escaped
+     return
# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
tok_state_before_attribute_name = ->
parse_error()
tok_state = tok_state_data
else
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
attr_name = c.toLowerCase()
else
attr_name = c
parse_error()
tok_state = tok_state_data
else
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
else
tok_cur_tag.attrs_a[0][0] += c
if c is '>'
tok_state = tok_state_data
return
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
tok_state = tok_state_attribute_name
return
cur -= 1 # we didn't handle that char
return null
+ # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
+ # ">" completes the tag: it is flagged 'self-closing' and returned. Any
+ # other input is a parse error; the character is put back, and the next
+ # state is data at EOF, before-attribute-name otherwise.
+ tok_state_self_closing_start_tag = ->
+     c = txt.charAt(cur++)
+     if c is '>'
+         tok_cur_tag.flag 'self-closing'
+         tok_state = tok_state_data
+         return tok_cur_tag
+     # EOF ('' from charAt) and "anything else" differ only in the next state
+     parse_error()
+     tok_state = if c is '' then tok_state_data else tok_state_before_attribute_name
+     cur -= 1 # Reconsume
+     return
+
# 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
# WARNING: put a comment token in tok_cur_tag before setting this state
tok_state_bogus_comment = ->
tok_state = tok_state_doctype
return
acn = adjusted_current_node()
- if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
+ if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
cur += 7
tok_state = tok_state_cdata_section
return
c = txt.charAt(cur++)
if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
return
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag = new_doctype_token c.toLowerCase()
tok_state = tok_state_doctype_name
return
if c is '>'
tok_state = tok_state_data
return tok_cur_tag
- if uc_alpha.indexOf(c) > -1
+ if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
return
if c is "\u0000"
pending_table_character_tokens = []
head_element_pointer = null
flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
- context_element = null # FIXME initialize from args.fragment
+ context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
# tokenizer initialization
tok_state = tok_state_data