X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=85a91af5f83259a3507a98cb586c696ae0439f23;hb=6c4bd699f811de0097387b167097665b57c4b011;hp=c6ed9a5769cdb683f41dbe747f3fd9b82a4033ba;hpb=ffc91832d8b2c91ddd4407cf4036b6fc0eeca928;p=peach-html5-editor.git
diff --git a/parse-html.coffee b/parse-html.coffee
index c6ed9a5..85a91af 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -47,6 +47,12 @@
# 0: a "end of the list", "current node", "bottommost", "last"
+# Browser shim: when no CommonJS `module.exports` is visible, create a
+# `window.wheic` namespace object and make `module.exports` refer to it.
+# note: to get this to run outside a browser, you'll have to write a native
+# implementation of decode_named_char_ref()
+# NOTE(review): assigning to `module` below makes the compiler declare it as a
+# file-scoped variable; unless the file is compiled with --bare this presumably
+# shadows the CommonJS `module`, so the test would never see it -- confirm.
+unless module?.exports?
+ window.wheic = {}
+ module = exports: window.wheic
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
@@ -158,8 +164,10 @@ new_element = (name) ->
new_text_node = (txt) ->
return new Node TYPE_TEXT, text: txt
new_character_token = new_text_node
-new_comment_node = (txt) ->
+new_comment_token = (txt) ->
return new Node TYPE_COMMENT, text: txt
+new_doctype_token = (name) ->
+ return new Node TYPE_DOCTYPE, name: name
new_eof_token = ->
return new Node TYPE_EOF
new_afe_marker = ->
@@ -367,6 +375,7 @@ parse_html = (txt, parse_error_cb = null) ->
pending_table_character_tokens = null
head_element_pointer = null
flag_fragment_parsing = null
+ context_element = null
stop_parsing = ->
flag_parsing = false
@@ -617,6 +626,14 @@ parse_html = (txt, parse_error_cb = null) ->
node = open_els[node_i]
# 19. Return to the step labeled loop.
+ # 8.2.3.2
+
+ # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
+ # Returns context_element when exactly one element is open and the
+ # fragment-parsing flag is set; otherwise returns open_els[0].
+ adjusted_current_node = ->
+     unless open_els.length is 1 and flag_fragment_parsing
+         return open_els[0]
+     return context_element
+
# http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
# this implementation is structured (mostly) as described at the link above.
# capitalized comments are the "labels" described at the link above.
@@ -1144,8 +1161,8 @@ parse_html = (txt, parse_error_cb = null) ->
doc.children.push t
return
if t.type is TYPE_DOCTYPE
+ # FIXME check identifiers, set quirks, etc
# fixfull
- t.name = 'html'
doc.children.push t
insertion_mode = ins_mode_before_html
return
@@ -1247,7 +1264,7 @@ parse_html = (txt, parse_error_cb = null) ->
# fixfull encoding stuff
return
if t.type is TYPE_START_TAG and t.name is 'title'
- parse_generic_rcdata_element t
+ parse_generic_rcdata_text t
return
if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
parse_generic_raw_text t
@@ -1259,7 +1276,7 @@ parse_html = (txt, parse_error_cb = null) ->
if t.type is TYPE_START_TAG and t.name is 'script'
ail = adjusted_insertion_location()
el = token_to_element t, NS_HTML, ail
- el.flag_parser_inserted true # FIXME implement
+ el.flag 'parser-inserted', true # FIXME implement
# fixfull fragment case
ail[0].children.splice ail[1], 0, el
open_els.unshift el
@@ -1300,12 +1317,12 @@ parse_html = (txt, parse_error_cb = null) ->
parse_error()
return
ins_mode_in_head_else t
-
+
# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
ins_mode_in_head_noscript = (t) ->
# FIXME ?fixfull
console.log "ins_mode_in_head_noscript unimplemented"
-
+
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
ins_mode_after_head_else = (t) ->
body_tok = new_open_tag 'body'
@@ -2270,6 +2287,7 @@ parse_html = (txt, parse_error_cb = null) ->
tok_state = tok_state_end_tag_open
when '?'
parse_error()
+ tok_cur_tag = new_comment_token '?'
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
@@ -2304,6 +2322,7 @@ parse_html = (txt, parse_error_cb = null) ->
tok_state = tok_state_tag_name
else
parse_error()
+ tok_cur_tag = new_comment_token '/'
tok_state = tok_state_bogus_comment
return null
@@ -2671,6 +2690,601 @@ parse_html = (txt, parse_error_cb = null) ->
cur -= 1 # we didn't handle that char
return null
+ # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
+ # WARNING: put a comment token in tok_cur_tag before setting this state
+ #
+ # Consumes everything up to and including the next ">" (or up to the end of
+ # txt when there is none), appends it to tok_cur_tag.text with every U+0000
+ # replaced by U+FFFD, switches to the data state and returns the comment
+ # token.
+ tok_state_bogus_comment = ->
+     next_gt = txt.indexOf '>', cur
+     if next_gt is -1
+         val = txt.substr cur
+         cur = txt.length
+     else
+         val = txt.substr cur, (next_gt - cur)
+         cur = next_gt + 1
+     # a string pattern would only replace the first NUL; the regex with the
+     # g flag replaces all of them
+     val = val.replace(/\u0000/g, "\ufffd")
+     tok_cur_tag.text += val
+     tok_state = tok_state_data
+     return tok_cur_tag
+
+ # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
+ # Looks ahead from cur and only selects the next tokenizer state:
+ #   "--"                                   -> comment start
+ #   "doctype" (any letter case)            -> DOCTYPE
+ #   "[CDATA[" outside the HTML namespace   -> CDATA section
+ #   anything else                          -> parse error, bogus comment
+ # Never returns a token.
+ tok_state_markup_declaration_open = ->
+     if txt.substr(cur, 2) is '--'
+         cur += 2
+         tok_cur_tag = new_comment_token ''
+         tok_state = tok_state_comment_start
+         return
+     if txt.substr(cur, 7).toLowerCase() is 'doctype'
+         cur += 7
+         tok_state = tok_state_doctype
+         return
+     acn = adjusted_current_node()
+     # the input string is named txt (the previous "text" was an undefined name)
+     if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
+         cur += 7
+         tok_state = tok_state_cdata_section
+         return
+     # Otherwise
+     parse_error()
+     # Per the spec the next character consumed is the first character of the
+     # comment, so the token starts out empty (the "!" is not part of it).
+     tok_cur_tag = new_comment_token ''
+     tok_state = tok_state_bogus_comment
+     return
+
+ # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
+ # First state after "<!--". Returns the comment token in tok_cur_tag when the
+ # comment ends immediately (">" or EOF), otherwise null.
+ tok_state_comment_start = ->
+     switch c = txt.charAt(cur++)
+         when '-'
+             tok_state = tok_state_comment_start_dash
+         when "\u0000"
+             parse_error()
+             # NUL becomes U+FFFD inside the comment (it is not emitted as a
+             # separate character token) and the comment continues
+             tok_cur_tag.text += "\ufffd"
+             tok_state = tok_state_comment
+         when '>'
+             parse_error()
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.text += c
+             # leave the start state, otherwise a later "->" would be
+             # mistaken for the abrupt "<!-->" / "<!--->" endings
+             tok_state = tok_state_comment
+     return null
+
+ # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
+ # Entered from the comment start state after a "-". Returns the comment
+ # token when the comment is finished (by ">" or EOF), otherwise null.
+ tok_state_comment_start_dash = ->
+     c = txt.charAt(cur++)
+     if c is '-'
+         tok_state = tok_state_comment_end
+         return null
+     if c is '>' or c is ''
+         # abrupt end of the comment: emit what has been collected
+         parse_error()
+         tok_state = tok_state_data
+         cur -= 1 if c is '' # Reconsume the EOF
+         return tok_cur_tag
+     # the pending "-" turns out to be comment data
+     if c is "\u0000"
+         parse_error()
+         tok_cur_tag.text += "-\ufffd"
+     else
+         tok_cur_tag.text += "-#{c}"
+     tok_state = tok_state_comment
+     return null
+
+ # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
+ # Collects comment data one character at a time into tok_cur_tag.text.
+ # Only EOF makes this state return the token; everything else returns null.
+ tok_state_comment = ->
+     c = txt.charAt(cur++)
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     if c is '-'
+         tok_state = tok_state_comment_end_dash
+     else if c is "\u0000"
+         parse_error()
+         tok_cur_tag.text += "\ufffd"
+     else
+         tok_cur_tag.text += c
+     return null
+
+ # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
+ # One "-" has been seen inside the comment. A second one moves on to the
+ # comment end state; anything else puts the "-" into the comment data.
+ tok_state_comment_end_dash = ->
+     c = txt.charAt(cur++)
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     if c is '-'
+         tok_state = tok_state_comment_end
+         return null
+     if c is "\u0000"
+         parse_error()
+         tok_cur_tag.text += "-\ufffd"
+     else
+         tok_cur_tag.text += "-#{c}"
+     tok_state = tok_state_comment
+     return null
+
+ # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
+ # "--" has been seen inside the comment. ">" finishes the comment without
+ # an error; every other input is reported through parse_error().
+ tok_state_comment_end = ->
+     c = txt.charAt(cur++)
+     if c is '>'
+         tok_state = tok_state_data
+         return tok_cur_tag
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     # all remaining cases are parse errors that keep the comment open
+     parse_error()
+     if c is '!'
+         tok_state = tok_state_comment_end_bang
+     else if c is '-'
+         tok_cur_tag.text += '-'
+     else if c is "\u0000"
+         tok_cur_tag.text += "--\ufffd"
+         tok_state = tok_state_comment
+     else
+         tok_cur_tag.text += "--#{c}"
+         tok_state = tok_state_comment
+     return null
+
+ # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
+ # "--!" has been seen inside the comment. ">" finishes the comment; any
+ # other input puts "--!" into the comment data. Returns the comment token
+ # when it is finished (">" or EOF), otherwise null.
+ tok_state_comment_end_bang = ->
+     switch c = txt.charAt(cur++)
+         when '-'
+             # only "--!" is appended here: this "-" may be the start of the
+             # real terminator, so the comment end dash state accounts for it
+             tok_cur_tag.text += "--!"
+             tok_state = tok_state_comment_end_dash
+         when '>'
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when "\u0000"
+             parse_error()
+             tok_cur_tag.text += "--!\ufffd"
+             tok_state = tok_state_comment
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.text += "--!#{c}"
+             tok_state = tok_state_comment
+     return null
+
+ # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+ # Entered right after the "doctype" keyword. Whitespace moves on silently;
+ # EOF emits an empty force-quirks DOCTYPE token; any other character is a
+ # parse error and is reconsumed in the before DOCTYPE name state.
+ tok_state_doctype = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         tok_state = tok_state_before_doctype_name
+         return null
+     # EOF and "anything else" are both parse errors and both reconsumed
+     parse_error()
+     if c is '' # EOF
+         tok_state = tok_state_data
+         el = new_doctype_token ''
+         el.flag 'force-quirks', true
+         cur -= 1 # Reconsume
+         return el
+     tok_state = tok_state_before_doctype_name
+     cur -= 1 # Reconsume
+     return null
+
+ # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
+ # Skips whitespace, then creates the DOCTYPE token in tok_cur_tag from the
+ # first character of the name (uppercase letters are lowercased, U+0000
+ # becomes U+FFFD). ">" and EOF emit a nameless force-quirks DOCTYPE token
+ # right away; every other path returns nothing / null.
+ tok_state_before_doctype_name = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         return
+     # EOF must be tested before the uc_alpha lookup: c is '' here, and if
+     # uc_alpha is a string then uc_alpha.indexOf('') is 0, which would send
+     # EOF down the "uppercase letter" path and never terminate.
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         el = new_doctype_token ''
+         el.flag 'force-quirks', true
+         cur -= 1 # Reconsume
+         return el
+     if uc_alpha.indexOf(c) > -1
+         tok_cur_tag = new_doctype_token c.toLowerCase()
+         tok_state = tok_state_doctype_name
+         return
+     if c is "\u0000"
+         parse_error()
+         tok_cur_tag = new_doctype_token "\ufffd"
+         tok_state = tok_state_doctype_name
+         return
+     if c is '>'
+         parse_error()
+         el = new_doctype_token ''
+         el.flag 'force-quirks', true
+         tok_state = tok_state_data
+         return el
+     # Anything else
+     tok_cur_tag = new_doctype_token c
+     tok_state = tok_state_doctype_name
+     return null
+
+ # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
+ # Appends characters to tok_cur_tag.name (uppercase letters lowercased,
+ # U+0000 replaced by U+FFFD) until whitespace, ">" or EOF. Returns the
+ # DOCTYPE token on ">" and on EOF (EOF also sets force-quirks).
+ tok_state_doctype_name = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         tok_state = tok_state_after_doctype_name
+         return
+     if c is '>'
+         tok_state = tok_state_data
+         return tok_cur_tag
+     # EOF must be tested before the uc_alpha lookup: c is '' here, and if
+     # uc_alpha is a string then uc_alpha.indexOf('') is 0, which would treat
+     # EOF as a letter and keep this state running forever.
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         tok_cur_tag.flag 'force-quirks', true
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     if uc_alpha.indexOf(c) > -1
+         tok_cur_tag.name += c.toLowerCase()
+         return
+     if c is "\u0000"
+         parse_error()
+         tok_cur_tag.name += "\ufffd"
+         return
+     # Anything else
+     tok_cur_tag.name += c
+     return null
+
+ # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
+ # After the DOCTYPE name and at least one whitespace character: expects
+ # ">", or the keyword "public" / "system" in any letter case.
+ tok_state_after_doctype_name = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             return
+         when '>'
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else: c has already been consumed, so a keyword would start
+     # at cur - 1 and its remaining five characters still need to be skipped
+     switch txt.substr(cur - 1, 6).toLowerCase()
+         when 'public'
+             cur += 5
+             tok_state = tok_state_after_doctype_public_keyword
+             return
+         when 'system'
+             cur += 5
+             tok_state = tok_state_after_doctype_system_keyword
+             return
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
+ # Directly after the "public" keyword. A quote here (without whitespace in
+ # between) is a parse error but still starts the public identifier.
+ tok_state_after_doctype_public_keyword = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             tok_state = tok_state_before_doctype_public_identifier
+             return
+         when '"'
+             parse_error()
+             tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+             tok_state = tok_state_doctype_public_identifier_double_quoted
+             return
+         when "'"
+             parse_error()
+             tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+             tok_state = tok_state_doctype_public_identifier_single_quoted
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
+ # After "public" plus whitespace. Skips further whitespace; a quote starts
+ # the public identifier. Returns the DOCTYPE token (force-quirks set) on ">"
+ # and EOF; other paths return nothing / null.
+ tok_state_before_doctype_public_identifier = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         return
+     # An opening quote is the expected input in this state, so (unlike in
+     # the after DOCTYPE public keyword state) it is not a parse error.
+     if c is '"'
+         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+         tok_state = tok_state_doctype_public_identifier_double_quoted
+         return
+     if c is "'"
+         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+         tok_state = tok_state_doctype_public_identifier_single_quoted
+         return
+     if c is '>'
+         parse_error()
+         tok_cur_tag.flag 'force-quirks', true
+         tok_state = tok_state_data
+         return tok_cur_tag
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         tok_cur_tag.flag 'force-quirks', true
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+
+ # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
+ # Collects tok_cur_tag.public_identifier up to the closing double quote.
+ # ">" and EOF end the DOCTYPE early with force-quirks set.
+ tok_state_doctype_public_identifier_double_quoted = ->
+     switch c = txt.charAt(cur++)
+         when '"'
+             tok_state = tok_state_after_doctype_public_identifier
+             return
+         when "\u0000"
+             parse_error()
+             tok_cur_tag.public_identifier += "\ufffd"
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.public_identifier += c
+     return null
+
+ # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
+ # Collects tok_cur_tag.public_identifier up to the closing single quote.
+ # ">" and EOF end the DOCTYPE early with force-quirks set.
+ tok_state_doctype_public_identifier_single_quoted = ->
+     switch c = txt.charAt(cur++)
+         when "'"
+             tok_state = tok_state_after_doctype_public_identifier
+             return
+         when "\u0000"
+             parse_error()
+             tok_cur_tag.public_identifier += "\ufffd"
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.public_identifier += c
+     return null
+
+ # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
+ # Directly after the closing quote of the public identifier. A quote here
+ # (without whitespace in between) is a parse error but still starts the
+ # system identifier.
+ tok_state_after_doctype_public_identifier = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             tok_state = tok_state_between_doctype_public_and_system_identifiers
+             return
+         when '>'
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '"'
+             parse_error()
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_double_quoted
+             return
+         when "'"
+             parse_error()
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_single_quoted
+             return
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
+ # After the public identifier plus whitespace. Skips further whitespace;
+ # ">" emits the DOCTYPE token, a quote starts the system identifier.
+ # Returns the DOCTYPE token on ">" and on EOF (EOF also sets force-quirks).
+ tok_state_between_doctype_public_and_system_identifiers = ->
+     c = txt.charAt(cur++)
+     if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+         return
+     if c is '>'
+         tok_state = tok_state_data
+         return tok_cur_tag
+     # Whitespace has separated the two identifiers, so an opening quote is
+     # the expected input here and is not a parse error.
+     if c is '"'
+         tok_cur_tag.system_identifier = ''
+         tok_state = tok_state_doctype_system_identifier_double_quoted
+         return
+     if c is "'"
+         tok_cur_tag.system_identifier = ''
+         tok_state = tok_state_doctype_system_identifier_single_quoted
+         return
+     if c is '' # EOF
+         parse_error()
+         tok_state = tok_state_data
+         tok_cur_tag.flag 'force-quirks', true
+         cur -= 1 # Reconsume
+         return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
+ # Directly after the "system" keyword. A quote here (without whitespace in
+ # between) is a parse error but still starts the system identifier.
+ tok_state_after_doctype_system_keyword = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             tok_state = tok_state_before_doctype_system_identifier
+             return
+         when '"'
+             parse_error()
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_double_quoted
+             return
+         when "'"
+             parse_error()
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_single_quoted
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
+ # After "system" plus whitespace. Skips further whitespace; a quote starts
+ # the system identifier without a parse error.
+ tok_state_before_doctype_system_identifier = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             return
+         when '"'
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_double_quoted
+             return
+         when "'"
+             tok_cur_tag.system_identifier = ''
+             tok_state = tok_state_doctype_system_identifier_single_quoted
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else
+     parse_error()
+     tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
+ # Collects tok_cur_tag.system_identifier up to the closing double quote.
+ # ">" and EOF end the DOCTYPE early with force-quirks set.
+ tok_state_doctype_system_identifier_double_quoted = ->
+     switch c = txt.charAt(cur++)
+         when '"'
+             tok_state = tok_state_after_doctype_system_identifier
+             return
+         when "\u0000"
+             parse_error()
+             tok_cur_tag.system_identifier += "\ufffd"
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.system_identifier += c
+     return null
+
+ # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
+ # Collects tok_cur_tag.system_identifier up to the closing single quote.
+ # ">" and EOF end the DOCTYPE early with force-quirks set.
+ tok_state_doctype_system_identifier_single_quoted = ->
+     switch c = txt.charAt(cur++)
+         when "'"
+             tok_state = tok_state_after_doctype_system_identifier
+             return
+         when "\u0000"
+             parse_error()
+             tok_cur_tag.system_identifier += "\ufffd"
+             return
+         when '>'
+             parse_error()
+             tok_cur_tag.flag 'force-quirks', true
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+         else
+             tok_cur_tag.system_identifier += c
+     return null
+
+ # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
+ # After the closing quote of the system identifier: only whitespace and
+ # ">" are expected. Trailing junk is a parse error handled by the bogus
+ # DOCTYPE state, but it does not set force-quirks.
+ tok_state_after_doctype_system_identifier = ->
+     switch c = txt.charAt(cur++)
+         when "\t", "\u000a", "\u000c", ' '
+             return
+         when '>'
+             tok_state = tok_state_data
+             return tok_cur_tag
+         when '' # EOF
+             parse_error()
+             tok_state = tok_state_data
+             tok_cur_tag.flag 'force-quirks', true
+             cur -= 1 # Reconsume
+             return tok_cur_tag
+     # Anything else
+     parse_error()
+     # do _not_ tok_cur_tag.flag 'force-quirks', true
+     tok_state = tok_state_bogus_doctype
+     return null
+
+ # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
+ # Discards characters until ">" or EOF, then switches to the data state
+ # and returns the DOCTYPE token held in tok_cur_tag.
+ tok_state_bogus_doctype = ->
+     c = txt.charAt(cur++)
+     # Anything other than ">" and EOF is ignored
+     return null unless c is '>' or c is ''
+     tok_state = tok_state_data
+     cur -= 1 if c is '' # Reconsume the EOF
+     return tok_cur_tag
+
+
# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
# Don't set this as a state, just call it
# returns a string (NOT a text node)
@@ -2764,6 +3378,7 @@ parse_html = (txt, parse_error_cb = null) ->
pending_table_character_tokens = []
head_element_pointer = null
flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
+ context_element = null # FIXME initialize from args.fragment
# tokenizer initialization
tok_state = tok_state_data
@@ -2776,15 +3391,6 @@ parse_html = (txt, parse_error_cb = null) ->
# fixfull parse error if has self-closing flag, but it wasn't acknowledged
return doc.children
-test_results = passed: 0, failed: 0
-# everything below is tests on the above
-test_equals = (description, output, expected_output) ->
- if output is expected_output
- console.log "passed." # don't say name, so smart consoles can merge all of these
- else
- console.log "FAILED: \"#{description}\""
- console.log " Expected: #{expected_output}"
- console.log " Actual: #{output}"
serialize_els = (els, shallow, show_ids) ->
serialized = ''
sep = ''
@@ -2793,205 +3399,12 @@ serialize_els = (els, shallow, show_ids) ->
sep = ','
serialized += t.serialize shallow, show_ids
return serialized
-test_parser = (args) ->
- debug_log_reset()
- parse_errors = []
- errors_cb = (i) ->
- parse_errors.push i
- prev_node_id = 0 # reset counter
- parsed = parse_html args.html, errors_cb
- serialized = serialize_els parsed, false, false
- expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'
- if serialized isnt expected
- debug_log_each (str) ->
- console.log str
- console.log "FAILED: \"#{args.name}\""
- console.log " Input: #{args.html}"
- console.log " Correct: #{expected}"
- console.log " Output: #{serialized}"
- if parse_errors.length > 0
- console.log " parse errs: #{JSON.stringify parse_errors}"
- else
- console.log " No parse errors"
- test_results.failed += 1
- else
- #console.log "passed \"#{args.name}\""
- test_results.passed += 1
-test_summary = ->
- console.log "Tests passed: #{test_results.passed}"
- console.log "Tests Failed: #{test_results.failed}"
-
-test_parser name: "empty", \
- html: "",
- expected: ''
-test_parser name: "just text", \
- html: "abc",
- expected: 'text:"abc"'
-test_parser name: "named entity", \
- html: "a&1234",
- expected: 'text:"a&1234"'
-test_parser name: "broken named character references", \
- html: "1&2&&3&aabbcc;",
- expected: 'text:"1&2&&3&aabbcc;"'
-test_parser name: "numbered entity overrides", \
- html: "1 ",
- expected: 'text:"1â¬â¬ Æ"'
-test_parser name: "open tag", \
- html: "foobar",
- expected: 'text:"foo",tag:"span",{},[text:"bar"]'
-test_parser name: "open tag with attributes", \
- html: "foobar",
- expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
-test_parser name: "open tag with attributes of various quotings", \
- html: "foobar",
- expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
-test_parser name: "attribute entity exceptions dq", \
- html: "foobar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
-test_parser name: "attribute entity exceptions sq", \
- html: "foobar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
-test_parser name: "attribute entity exceptions uq", \
- html: "foobar",
- expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
-test_parser name: "matching closing tags", \
- html: "foohi 12345 23second",
- # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
- # firefox does this:
- expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
-# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
-test_parser name: "html5lib aaa 1", \
- html: '
2
3', - expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]' -test_parser name: "html5lib aaa 3", \ - html: '13', - expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]' -test_parser name: "html5lib aaa 4", \ - html: '123', - expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]' -test_parser name: "html5lib aaa 5 (two divs deep)", \ - html: '12 | 3
B | C