From 6c4bd699f811de0097387b167097665b57c4b011 Mon Sep 17 00:00:00 2001
From: Jason Woofenden
Date: Tue, 22 Dec 2015 11:42:54 -0500
Subject: [PATCH] doctypes: parsing, tests pass

---
 parse-html.coffee |  437 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 test.coffee       |    6 +-
 2 files changed, 438 insertions(+), 5 deletions(-)

diff --git a/parse-html.coffee b/parse-html.coffee
index 4c1d3cc..85a91af 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -166,6 +166,8 @@ new_text_node = (txt) ->
 new_character_token = new_text_node
 new_comment_token = (txt) ->
 	return new Node TYPE_COMMENT, text: txt
+new_doctype_token = (name) ->
+	return new Node TYPE_DOCTYPE, name: name
 new_eof_token = ->
 	return new Node TYPE_EOF
 new_afe_marker = ->
@@ -1159,8 +1161,8 @@ parse_html = (txt, parse_error_cb = null) ->
 			doc.children.push t
 			return
 		if t.type is TYPE_DOCTYPE
+			# FIXME check identifiers, set quirks, etc
 			# fixfull
-			t.name = 'html'
 			doc.children.push t
 			insertion_mode = ins_mode_before_html
 			return
@@ -1262,7 +1264,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			# fixfull encoding stuff
 			return
 		if t.type is TYPE_START_TAG and t.name is 'title'
-			parse_generic_rcdata_element t
+			parse_generic_rcdata_text t
 			return
 		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
 			parse_generic_raw_text t
@@ -1274,7 +1276,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_START_TAG and t.name is 'script'
 			ail = adjusted_insertion_location()
 			el = token_to_element t, NS_HTML, ail
-			el.flag_parser_inserted true # FIXME implement
+			el.flag 'parser-inserted', true # FIXME implement
 			# fixfull frament case
 			ail[0].children.splice ail[1], 0, el
 			open_els.unshift el
@@ -2720,7 +2722,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			tok_state = tok_state_cdata_section
 			return
 		# Otherwise
-		parse_errer()
+		parse_error()
 		tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
 		tok_state = tok_state_bogus_comment
 		return
@@ -2855,6 +2857,433 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_comment
 		return null
 
+	# 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+	tok_state_doctype = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' '
+				tok_state = tok_state_before_doctype_name
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				el = new_doctype_token ''
+				el.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return el
+			else
+				parse_error()
+				tok_state = tok_state_before_doctype_name
+				cur -= 1 # Reconsume
+		return null
+
+	# 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
+	tok_state_before_doctype_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '' # EOF; tested before the uc_alpha lookup because "ABC".indexOf('') is 0, not -1
+			parse_error()
+			tok_state = tok_state_data
+			el = new_doctype_token ''
+			el.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return el
+		if uc_alpha.indexOf(c) > -1
+			tok_cur_tag = new_doctype_token c.toLowerCase()
+			tok_state = tok_state_doctype_name
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag = new_doctype_token "\ufffd"
+			tok_state = tok_state_doctype_name
+			return
+		if c is '>'
+			parse_error()
+			el = new_doctype_token ''
+			el.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return el
+		# Anything else
+		tok_cur_tag = new_doctype_token c
+		tok_state = tok_state_doctype_name
+		return null
+
+	# 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
+	tok_state_doctype_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			tok_state = tok_state_after_doctype_name
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF; tested before the uc_alpha lookup because "ABC".indexOf('') is 0, not -1
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		if uc_alpha.indexOf(c) > -1
+			tok_cur_tag.name += c.toLowerCase()
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag.name += "\ufffd"
+			return
+		# Anything else
+		tok_cur_tag.name += c
+		return null
+
+	# 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
+	tok_state_after_doctype_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		if txt.substr(cur - 1, 6).toLowerCase() is 'public'
+			cur += 5
+			tok_state = tok_state_after_doctype_public_keyword
+			return
+		if txt.substr(cur - 1, 6).toLowerCase() is 'system'
+			cur += 5
+			tok_state = tok_state_after_doctype_system_keyword
+			return
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
+	tok_state_after_doctype_public_keyword = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			tok_state = tok_state_before_doctype_public_identifier
+			return
+		if c is '"'
+			parse_error()
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_double_quoted
+			return
+		if c is "'"
+			parse_error()
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_single_quoted
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
+	tok_state_before_doctype_public_identifier = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '"'
+			# Not a parse error: the spec only flags a quote directly after the keyword
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_double_quoted
+			return
+		if c is "'"
+			# Not a parse error: the spec only flags a quote directly after the keyword
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_single_quoted
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+
+	# 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
+	tok_state_doctype_public_identifier_double_quoted = ->
+		c = txt.charAt(cur++)
+		if c is '"'
+			tok_state = tok_state_after_doctype_public_identifier
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag.public_identifier += "\ufffd"
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		tok_cur_tag.public_identifier += c
+		return null
+
+	# 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
+	tok_state_doctype_public_identifier_single_quoted = ->
+		c = txt.charAt(cur++)
+		if c is "'"
+			tok_state = tok_state_after_doctype_public_identifier
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag.public_identifier += "\ufffd"
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		tok_cur_tag.public_identifier += c
+		return null
+
+	# 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
+	tok_state_after_doctype_public_identifier = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			tok_state = tok_state_between_doctype_public_and_system_identifiers
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '"'
+			parse_error()
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_double_quoted
+			return
+		if c is "'"
+			parse_error()
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_single_quoted
+			return
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
+	tok_state_between_doctype_public_and_system_identifiers = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '"'
+			# Not a parse error: whitespace already separated the two identifiers
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_double_quoted
+			return
+		if c is "'"
+			# Not a parse error: whitespace already separated the two identifiers
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_single_quoted
+			return
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
+	tok_state_after_doctype_system_keyword = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			tok_state = tok_state_before_doctype_system_identifier
+			return
+		if c is '"'
+			parse_error()
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_double_quoted
+			return
+		if c is "'"
+			parse_error()
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_single_quoted
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
+	tok_state_before_doctype_system_identifier = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '"'
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_double_quoted
+			return
+		if c is "'"
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_single_quoted
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
+	tok_state_doctype_system_identifier_double_quoted = ->
+		c = txt.charAt(cur++)
+		if c is '"'
+			tok_state = tok_state_after_doctype_system_identifier
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag.system_identifier += "\ufffd"
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		tok_cur_tag.system_identifier += c
+		return null
+
+	# 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
+	tok_state_doctype_system_identifier_single_quoted = ->
+		c = txt.charAt(cur++)
+		if c is "'"
+			tok_state = tok_state_after_doctype_system_identifier
+			return
+		if c is "\u0000"
+			parse_error()
+			tok_cur_tag.system_identifier += "\ufffd"
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		tok_cur_tag.system_identifier += c
+		return null
+
+	# 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
+	tok_state_after_doctype_system_identifier = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		# do _not_ tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
+	tok_state_bogus_doctype = ->
+		c = txt.charAt(cur++)
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		return null
+
 
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 	# Don't set this as a state, just call it
diff --git a/test.coffee b/test.coffee
index baea4e7..9e82bdd 100644
--- a/test.coffee
+++ b/test.coffee
@@ -7809,7 +7809,11 @@ serialize_els = (els, prefix = '| ') ->
 			when wheic.TYPE_COMMENT
 				ret += "#{prefix}<!-- #{el.text} -->\n"
 			when wheic.TYPE_DOCTYPE
-				ret += "#{prefix}<!DOCTYPE #{el.name}>\n" # FIXME add ids
+				ret += "#{prefix}<!DOCTYPE #{el.name}"
+				if (el.public_identifier? and el.public_identifier.length > 0) or (el.system_identifier? and el.system_identifier.length > 0)
+					ret += " \"#{el.public_identifier ? ''}\""
+					ret += " \"#{el.system_identifier ? ''}\""
+				ret += ">\n"
 			else
 				ret += "#{prefix}UNKNOWN TAG TYPE #{el.type}"
 	return ret
-- 
1.7.10.4