return null if decoded is txt
return g_dncr.cache[txt] = decoded
-parse_html = (txt, parse_error_cb = null) ->
- cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+ txt = null
+ cur = null # index of next char in txt to be parsed
# declare doc and tokenizer variables so they're in scope below
doc = null
open_els = null # stack of open elements
flag_parsing = false
parse_error = ->
- if parse_error_cb?
- parse_error_cb cur
+ if args.error_cb?
+ args.error_cb cur
else
console.log "Parse error at character #{cur} of #{txt.length}"
is_appropriate_end_tag = (t) ->
# spec says to check against "the tag name of the last start tag to
# have been emitted from this tokenizer", but this is only called from
- # the various "raw" states, which I'm pretty sure all push the start
- # token onto open_els. TODO: verify this after the script data states
- # are implemented
+ # the various "raw" states, so it's hopefully ok to assume that
+ # open_els[0].name will work instead. TODO: verify this after the
+ # script data states are implemented
debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
return t.type is TYPE_END_TAG and t.name is open_els[0].name
tok_state = tok_state_self_closing_start_tag
return
# fall through
+ if c is '>'
+ if is_appropriate_end_tag tok_cur_tag
+ tok_state = tok_state_data
+ return tok_cur_tag
+ # fall through
if is_uc_alpha(c)
tok_cur_tag.name += c.toLowerCase()
temporary_buffer += c
return tmp
when "\u0000"
parse_error()
- tok_cur_tag.attrs_a[0][0] = "\ufffd"
+ tok_cur_tag.attrs_a[0][0] += "\ufffd"
when '"', "'", '<'
parse_error()
- tok_cur_tag.attrs_a[0][0] = c
+ tok_cur_tag.attrs_a[0][0] += c
when '' # EOF
parse_error()
tok_state = tok_state_data
else
if is_uc_alpha(c)
- tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+ tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
else
tok_cur_tag.attrs_a[0][0] += c
return null
else
val = txt.substr cur, (next_gt - cur)
cur = next_gt + 3
- val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+ val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
return new_character_token val # fixfull split
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
+ txt = args.html
+ cur = 0
doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
open_els = []
afe = [] # active formatting elements
template_ins_modes = []
ins_mode = ins_mode_initial
original_ins_mode = ins_mode # TODO check spec
- flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+ flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
flag_frameset_ok = true
flag_parsing = true
flag_foster_parenting = false
}, {
name: "webkit02.dat #2"
html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+ scripting: true
errors: 1
expected: "| <html>\n| <head>\n| <body>\n| <p>\n| id=\"status\"\n| <noscript>\n| \"<strong>A</strong>\"\n| <span>\n| \"B\"\n"
}, {
name: "webkit02.dat #3"
html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+ scripting: false
errors: 1
expected: "| <html>\n| <head>\n| <body>\n| <p>\n| id=\"status\"\n| <noscript>\n| <strong>\n| \"A\"\n| <span>\n| \"B\"\n"
}, {
return
wheic.debug_log_reset()
parse_errors = []
- errors_cb = (i) ->
+ args.error_cb = (i) ->
parse_errors.push i
prev_node_id = 0 # reset counter
- parsed = wheic.parse_html args.html, errors_cb
+ parsed = wheic.parse_html args
serialized = serialize_els parsed
if serialized isnt args.expected
#wheic.debug_log_each (str) ->