return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
is_input_hidden_tok = (t) ->
- return unless t.type is TYPE_START_TAG
- for a of t.attrs_a
+ return false unless t.type is TYPE_START_TAG
+ for a in t.attrs_a
if a[0] is 'type'
if a[1].toLowerCase() is 'hidden'
return true
return null if decoded is txt
return g_dncr.cache[txt] = decoded
-parse_html = (txt, parse_error_cb = null) ->
- cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+ txt = null
+ cur = null # index of next char in txt to be parsed
# declare doc and tokenizer variables so they're in scope below
doc = null
open_els = null # stack of open elements
flag_parsing = false
parse_error = ->
- if parse_error_cb?
- parse_error_cb cur
+ if args.error_cb?
+ args.error_cb cur
else
console.log "Parse error at character #{cur} of #{txt.length}"
last_template = null
last_template_i = null
for el, i in open_els
- if el.name is 'template'
+ if el.name is 'template' and el.namespace is NS_HTML
last_template = el
last_template_i = i
break
last_table = null
last_table_i
for el, i in open_els
- if el.name is 'table'
+ if el.name is 'table' and el.namespace is NS_HTML
last_table = el
last_table_i = i
break
# this is odd
target = open_els[open_els.length - 1]
target_i = target.children.length
+ break
# 5. If last table has a parent element, then let adjusted
# insertion location be inside last table's parent element,
# immediately before last table, and abort these substeps.
el = insert_html_element t
head_element_pointer = el
ins_mode = ins_mode_in_head
+ return
if t.type is TYPE_END_TAG
if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
# fall through to Anything else below
if t.type is TYPE_START_TAG and t.name is 'title'
parse_generic_rcdata_text t
return
- if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+ if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
parse_generic_raw_text t
return
if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
if t.type is TYPE_DOCTYPE
parse_error()
return
- if t.type is TYPE_START_TAG
+ if t.type is TYPE_START_TAG and t.name is 'html'
ins_mode_in_body t
return
if t.type is TYPE_END_TAG and t.name is 'noscript'
open_els.shift()
ins_mode = ins_mode_in_head
return
- if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
+ if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
ins_mode_in_head t
return
if t.type is TYPE_END_TAG and t.name is 'br'
return
return
- ins_mode_in_table_else = (t) ->
- parse_error()
- flag_foster_parenting = true # FIXME
- ins_mode_in_body t
- flag_foster_parenting = false
- can_in_table = { # FIXME do this inline like everywhere else
- 'table': true
- 'tbody': true
- 'tfoot': true
- 'thead': true
- 'tr': true
- }
-
# 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
ins_mode_text = (t) ->
if t.type is TYPE_TEXT
# http://www.w3.org/TR/html5/syntax.html#tokenization
# 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
+ ins_mode_in_table_else = (t) ->
+ parse_error()
+ flag_foster_parenting = true
+ ins_mode_in_body t
+ flag_foster_parenting = false
+ return
+ can_in_table = { # FIXME do this inline like everywhere else
+ 'table': true
+ 'tbody': true
+ 'tfoot': true
+ 'thead': true
+ 'tr': true
+ }
ins_mode_in_table = (t) ->
switch t.type
when TYPE_TEXT
when 'style', 'script', 'template'
ins_mode_in_head t
when 'input'
- if is_input_hidden_tok t
+ unless is_input_hidden_tok t
ins_mode_in_table_else t
else
parse_error()
is_appropriate_end_tag = (t) ->
# spec says to check against "the tag name of the last start tag to
# have been emitted from this tokenizer", but this is only called from
- # the various "raw" states, which I'm pretty sure all push the start
- # token onto open_els. TODO: verify this after the script data states
- # are implemented
+ # the various "raw" states, so it's hopefully ok to assume that
+ # open_els[0].name will work instead. TODO: verify this after the script
+ # data states are implemented
debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
return t.type is TYPE_END_TAG and t.name is open_els[0].name
tok_state = tok_state_self_closing_start_tag
return
# fall through
+ if c is '>'
+ if is_appropriate_end_tag tok_cur_tag
+ tok_state = tok_state_data
+ return tok_cur_tag
+ # fall through
if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
temporary_buffer += c
tok_state = tok_state_self_closing_start_tag
return
# fall through
+ if c is '>'
+ if is_appropriate_end_tag tok_cur_tag
+ tok_state = tok_state_data
+ return tok_cur_tag
+ # fall through
if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
temporary_buffer += c.toLowerCase()
return tmp
when "\u0000"
parse_error()
- tok_cur_tag.attrs_a[0][0] = "\ufffd"
+ tok_cur_tag.attrs_a[0][0] += "\ufffd"
when '"', "'", '<'
parse_error()
- tok_cur_tag.attrs_a[0][0] = c
+ tok_cur_tag.attrs_a[0][0] += c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if is_uc_alpha(c)
- tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+ tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
else
tok_cur_tag.attrs_a[0][0] += c
return null
return
# Otherwise
parse_error()
- tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
+ tok_cur_tag = new_comment_token ''
tok_state = tok_state_bogus_comment
return
tok_state = tok_state_comment_start_dash
when "\u0000"
parse_error()
+ tok_state = tok_state_comment
return new_character_token "\ufffd"
when '>'
parse_error()
return tok_cur_tag
else
tok_cur_tag.text += c
+ tok_state = tok_state_comment
return null
# 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
return new_character_token val # fixfull split
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
+ txt = args.html
+ cur = 0
doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
open_els = []
afe = [] # active formatting elements
template_ins_modes = []
ins_mode = ins_mode_initial
original_ins_mode = ins_mode # TODO check spec
- flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+ flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
flag_frameset_ok = true
flag_parsing = true
flag_foster_parenting = false
# tokenizer initialization
tok_state = tok_state_data
+ if args.name is "one_that_breaks #1"
+ throw "hi" # console.log "hi"
# process input
# http://www.w3.org/TR/html5/syntax.html#tree-construction
while flag_parsing