X-Git-Url: https://jasonwoof.com/gitweb/?p=peach-html5-editor.git;a=blobdiff_plain;f=parser.js;h=1d8a53ac05647e85043700e6b1dde90add460361;hp=ea87c858bd44622989ace630e8cfb56fe620cea4;hb=HEAD;hpb=5aef791edd38fb3d70a71266ad0b42cf9fb45593 diff --git a/parser.js b/parser.js index ea87c85..1d8a53a 100644 --- a/parser.js +++ b/parser.js @@ -1,5 +1,5 @@ -// todo remove refs and lens, js, ls -// run test suite! +// todo remove unused variables +// todo remove debug log, or make a way to access it // Copyright 2015 Jason Woofenden // This file implements an HTML5 parser @@ -41,7 +41,7 @@ // // See README.md for how to run this file in the browser or in node.js. // -// This file exports a single useful function: parse_tml, and some constants +// This file exports a single useful function: parse, and some constants // (see the bottom of this file for those.) // // Call it like this: @@ -52,7 +52,13 @@ // // peach_parser.parse("

hi

", {fragment: "body"}) // -// return value is an array of Nodes, see "class Node" below. +// return value is an array of Nodes, A Node contains: +// type: one of: "tag", "text", "comment", "doctype" +// text: contents for text/comment nodes +// attrs: object of attributes, eg {href: "#main"} +// children: array of Nodes +// namespace: one of: "html", "mathml", "svg" +// parent: another Node or null // This code is a work in progress, eg try search this file for "fixfull", // "TODO" and "FIXME" @@ -77,14 +83,15 @@ // 2: c "next", "after", "lower", "below" // 1: b // 0: a "end of the list", "current node", "bottommost", "last" +(function () { + +var NS_HTML, NS_MATHML, NS_SVG, QUIRKS_LIMITED, QUIRKS_NO, QUIRKS_YES, TYPE_AAA_BOOKMARK, TYPE_AFE_MARKER, TYPE_COMMENT, TYPE_DOCTYPE, TYPE_END_TAG, TYPE_EOF, TYPE_START_TAG, TYPE_TAG, TYPE_TEXT, _decode_named_char_ref, adjust_foreign_attributes, adjust_mathml_attributes, adjust_svg_attributes, adp_els, alnum, context, debug_log, debug_log_each, debug_log_reset, decode_named_char_ref, decode_named_char_ref_cache, decode_named_char_ref_el, digits, el_is_special, el_is_special_not_adp, end_tag_implied, exports, foreign_attr_fixes, formatting_elements, foster_parenting_targets, from_code_point, g_debug_log, h_tags, hex_chars, is_html_integration, is_input_hidden_tok, is_lc_alpha, is_mathml_text_integration_point, is_space, is_space_tok, is_uc_alpha, lc_alpha, legacy_char_refs, mathml_elements, mathml_text_integration, new_aaa_bookmark, new_afe_marker, new_character_token, new_comment_token, new_doctype_token, new_element, new_end_tag, new_eof_token, new_open_tag, new_text_node, parse_html, prev_node_id, quirks_yes_pi_prefixes, space_chars, special_elements, svg_attribute_fixes, svg_elements, svg_name_fixes, tag_name_chars, uc_alpha, unicode_fixes, whitespace_chars if ((typeof module) !== 'undefined' && (module.exports != null)) { context = 'module' - exports = module.exports } else { context = 'browser' window.peach_parser = {} - exports = window.peach_parser } from_code_point = function (x) { @@ -145,10 +152,11 @@ function Node (type, args) { this.name = args.name != null ? args.name : '' // tag name this.text = args.text != null ? args.text : '' // contents for text/comment nodes this.attrs = args.attrs != null ? args.attrs : {} - this.attrs_a = args.attr_k != null ? args.attr_k : [] // attrs in progress, TYPE_START_TAG only this.children = args.children != null ? args.children : [] this.namespace = args.namespace != null ? args.namespace : NS_HTML this.parent = args.parent != null ? args.parent : null + // private: + this.attrs_a = args.attr_k != null ? args.attr_k : [] // attrs in progress, TYPE_START_TAG only this.token = args.token != null ? args.token : null this.flags = args.flags != null ? args.flags : {} if (args.id != null) { @@ -697,7 +705,7 @@ decode_named_char_ref = function (txt) { } parse_html = function (args_html, args) { - var adjusted_current_node, adjusted_insertion_location, adoption_agency, afe, afe_push, afe_push_marker, button_scopers, clear_afe_to_marker, clear_stack_to_table_body_context, clear_stack_to_table_context, clear_stack_to_table_row_context, clear_to_table_body_stopers, clear_to_table_row_stopers, clear_to_table_stopers, close_p_element, close_p_if_in_button_scope, close_the_cell, context_element, cur, doc, eat_next_token_if_newline, el_is_in_scope, flag_foster_parenting, flag_fragment_parsing, flag_frameset_ok, flag_parsing, flag_scripting, form_element_pointer, fragment_root, generate_implied_end_tags, has_color_face_or_size, head_element_pointer, in_body_any_other_end_tag, in_foreign_content, in_foreign_content_end_script, in_foreign_content_other_start, ins_mode, ins_mode_after_after_body, ins_mode_after_after_frameset, ins_mode_after_body, ins_mode_after_frameset, ins_mode_after_head, ins_mode_after_head_else, ins_mode_before_head, ins_mode_before_html, ins_mode_in_body, ins_mode_in_caption, ins_mode_in_cell, ins_mode_in_column_group, ins_mode_in_frameset, ins_mode_in_head, ins_mode_in_head_else, ins_mode_in_head_noscript, ins_mode_in_head_noscript_else, ins_mode_in_row, ins_mode_in_select, ins_mode_in_select_in_table, ins_mode_in_table, ins_mode_in_table_body, ins_mode_in_table_else, ins_mode_in_table_text, ins_mode_in_template, ins_mode_initial, ins_mode_text, insert_character, insert_comment, insert_foreign_element, insert_html_element, is_appropriate_end_tag, is_in_button_scope, is_in_li_scope, is_in_scope, is_in_scope_x, is_in_scope_x_y, is_in_select_scope, is_in_table_scope, is_quirks_limited_doctype, is_quirks_yes_doctype, li_scopers, open_els, original_ins_mode, parse_character_reference, parse_error, parse_generic_raw_text, parse_generic_rcdata_text, parse_init, parse_main_loop, pending_table_character_tokens, process_token, reconstruct_afe, ref, reset_ins_mode, standard_scopers, stop_parsing, table_scopers, template_ins_modes, template_tag_is_open, temporary_buffer, tok_cur_tag, tok_state, tok_state_after_attribute_name, tok_state_after_attribute_value_quoted, tok_state_after_doctype_name, tok_state_after_doctype_public_identifier, tok_state_after_doctype_public_keyword, tok_state_after_doctype_system_identifier, tok_state_after_doctype_system_keyword, tok_state_attribute_name, tok_state_attribute_value_double_quoted, tok_state_attribute_value_single_quoted, tok_state_attribute_value_unquoted, tok_state_before_attribute_name, tok_state_before_attribute_value, tok_state_before_doctype_name, tok_state_before_doctype_public_identifier, tok_state_before_doctype_system_identifier, tok_state_between_doctype_public_and_system_identifiers, tok_state_bogus_comment, tok_state_bogus_doctype, tok_state_cdata_section, tok_state_comment, tok_state_comment_end, tok_state_comment_end_bang, tok_state_comment_end_dash, tok_state_comment_start, tok_state_comment_start_dash, tok_state_data, tok_state_doctype, tok_state_doctype_name, tok_state_doctype_public_identifier_double_quoted, tok_state_doctype_public_identifier_single_quoted, tok_state_doctype_system_identifier_double_quoted, tok_state_doctype_system_identifier_single_quoted, tok_state_end_tag_open, tok_state_markup_declaration_open, tok_state_plaintext, tok_state_rawtext, tok_state_rawtext_end_tag_name, tok_state_rawtext_end_tag_open, tok_state_rawtext_less_than_sign, tok_state_rcdata, tok_state_rcdata_end_tag_name, tok_state_rcdata_end_tag_open, tok_state_rcdata_less_than_sign, tok_state_script_data, tok_state_script_data_double_escape_end, tok_state_script_data_double_escape_start, tok_state_script_data_double_escaped, tok_state_script_data_double_escaped_dash, tok_state_script_data_double_escaped_dash_dash, tok_state_script_data_double_escaped_less_than_sign, tok_state_script_data_end_tag_name, tok_state_script_data_end_tag_open, tok_state_script_data_escape_start, tok_state_script_data_escape_start_dash, tok_state_script_data_escaped, tok_state_script_data_escaped_dash, tok_state_script_data_escaped_dash_dash, tok_state_script_data_escaped_end_tag_name, tok_state_script_data_escaped_end_tag_open, tok_state_script_data_escaped_less_than_sign, tok_state_script_data_less_than_sign, tok_state_self_closing_start_tag, tok_state_tag_name, tok_state_tag_open, token_to_element, txt + var adjusted_current_node, adjusted_insertion_location, adoption_agency, afe, afe_push, afe_push_marker, button_scopers, clear_afe_to_marker, clear_stack_to_table_body_context, clear_stack_to_table_context, clear_stack_to_table_row_context, clear_to_table_body_stopers, clear_to_table_row_stopers, clear_to_table_stopers, close_p_element, close_p_if_in_button_scope, close_the_cell, context_element, cur, doc, eat_next_token_if_newline, el_is_in_scope, flag_foster_parenting, flag_fragment_parsing, flag_frameset_ok, flag_parsing, flag_scripting, form_element_pointer, fragment_root, generate_implied_end_tags, has_color_face_or_size, head_element_pointer, in_body_any_other_end_tag, in_foreign_content, in_foreign_content_end_script, in_foreign_content_other_start, ins_mode, ins_mode_after_after_body, ins_mode_after_after_frameset, ins_mode_after_body, ins_mode_after_frameset, ins_mode_after_head, ins_mode_after_head_else, ins_mode_before_head, ins_mode_before_html, ins_mode_in_body, ins_mode_in_caption, ins_mode_in_cell, ins_mode_in_column_group, ins_mode_in_frameset, ins_mode_in_head, ins_mode_in_head_else, ins_mode_in_head_noscript, ins_mode_in_head_noscript_else, ins_mode_in_row, ins_mode_in_select, ins_mode_in_select_in_table, ins_mode_in_table, ins_mode_in_table_body, ins_mode_in_table_else, ins_mode_in_table_text, ins_mode_in_template, ins_mode_initial, ins_mode_text, insert_character, insert_comment, insert_foreign_element, insert_html_element, is_appropriate_end_tag, is_in_button_scope, is_in_li_scope, is_in_scope, is_in_scope_x, is_in_scope_x_y, is_in_select_scope, is_in_table_scope, is_quirks_limited_doctype, is_quirks_yes_doctype, li_scopers, open_els, original_ins_mode, parse_character_reference, parse_error, parse_generic_raw_text, parse_generic_rcdata_text, parse_init, parse_main_loop, pending_table_character_tokens, process_token, reconstruct_afe, reset_ins_mode, standard_scopers, stop_parsing, table_scopers, template_ins_modes, template_tag_is_open, temporary_buffer, tok_cur_tag, tok_state, tok_state_after_attribute_name, tok_state_after_attribute_value_quoted, tok_state_after_doctype_name, tok_state_after_doctype_public_identifier, tok_state_after_doctype_public_keyword, tok_state_after_doctype_system_identifier, tok_state_after_doctype_system_keyword, tok_state_attribute_name, tok_state_attribute_value_double_quoted, tok_state_attribute_value_single_quoted, tok_state_attribute_value_unquoted, tok_state_before_attribute_name, tok_state_before_attribute_value, tok_state_before_doctype_name, tok_state_before_doctype_public_identifier, tok_state_before_doctype_system_identifier, tok_state_between_doctype_public_and_system_identifiers, tok_state_bogus_comment, tok_state_bogus_doctype, tok_state_cdata_section, tok_state_comment, tok_state_comment_end, tok_state_comment_end_bang, tok_state_comment_end_dash, tok_state_comment_start, tok_state_comment_start_dash, tok_state_data, tok_state_doctype, tok_state_doctype_name, tok_state_doctype_public_identifier_double_quoted, tok_state_doctype_public_identifier_single_quoted, tok_state_doctype_system_identifier_double_quoted, tok_state_doctype_system_identifier_single_quoted, tok_state_end_tag_open, tok_state_markup_declaration_open, tok_state_plaintext, tok_state_rawtext, tok_state_rawtext_end_tag_name, tok_state_rawtext_end_tag_open, tok_state_rawtext_less_than_sign, tok_state_rcdata, tok_state_rcdata_end_tag_name, tok_state_rcdata_end_tag_open, tok_state_rcdata_less_than_sign, tok_state_script_data, tok_state_script_data_double_escape_end, tok_state_script_data_double_escape_start, tok_state_script_data_double_escaped, tok_state_script_data_double_escaped_dash, tok_state_script_data_double_escaped_dash_dash, tok_state_script_data_double_escaped_less_than_sign, tok_state_script_data_end_tag_name, tok_state_script_data_end_tag_open, tok_state_script_data_escape_start, tok_state_script_data_escape_start_dash, tok_state_script_data_escaped, tok_state_script_data_escaped_dash, tok_state_script_data_escaped_dash_dash, tok_state_script_data_escaped_end_tag_name, tok_state_script_data_escaped_end_tag_open, tok_state_script_data_escaped_less_than_sign, tok_state_script_data_less_than_sign, tok_state_self_closing_start_tag, tok_state_tag_name, tok_state_tag_open, token_to_element, txt if (args == null) { args = {} } @@ -736,7 +744,7 @@ parse_html = function (args_html, args) { // http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements // "Noah's Ark clause" but with three afe_push = function (new_el) { - var attrs_match, el, i, j, k, len, matches, ref, ref1, v + var attrs_match, el, i, j, k, matches, v matches = 0 for (i = 0; i < afe.length; ++i) { el = afe[i] @@ -1146,7 +1154,7 @@ parse_html = function (args_html, args) { // http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p // http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements adoption_agency = function (subject) { - var aa, ab, ac, ad, ae, af, bookmark, c, ca, dest, el, fb, fb_of_open_els, fe, fe_of_afe, fe_of_open_els, i, in_afe, in_open_els, inner, j, l, last_node, len, len1, len10, len11, len12, len13, len14, len15, len16, len17, len2, len3, len4, len5, len6, len7, len8, len9, m, n, new_node, node, node_above, node_in_afe, node_next, o, outer, q, r, ref, ref1, s, t, u, w, y, z + var aa, ab, ac, ad, ae, af, bookmark, c, ca, dest, el, fb, fb_of_open_els, fe, fe_of_afe, fe_of_open_els, i, in_afe, in_open_els, inner, j, l, last_node, len, len1, len10, len11, len12, len13, len14, len15, len16, len17, len2, len3, len4, len5, len6, len7, len8, len9, m, n, new_node, node, node_above, node_in_afe, node_next, o, outer, q, r, s, t, u, w, y, z // this block implements tha W3C spec // # 1. If the current node is an HTML element whose tag name is subject, // # then run these substeps: @@ -1319,28 +1327,20 @@ parse_html = function (args_html, args) { // the list of active formatting elements, then remove node from // the list of active formatting elements. node_in_afe = false - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === node) { - if (inner > 3) { - afe.splice(i, 1) - } else { - node_in_afe = true - } - break + if ((i = afe.indexOf(node)) !== -1) { + if (inner > 3) { + afe.splice(i, 1) + } else { + node_in_afe = true } } // 6. If node is not in the list of active formatting elements, // then remove node from the stack of open elements and then go // back to the step labeled inner loop. if (!node_in_afe) { - for (i = 0; i < open_els.length; ++i) { - t = open_els[i] - if (t === node) { - node_above = open_els[i + 1] - open_els.splice(i, 1) - break - } + if ((i = open_els.indexOf(node)) !== -1) { + node_above = open_els[i + 1] + open_els.splice(i, 1) } continue } @@ -1352,51 +1352,31 @@ parse_html = function (args_html, args) { // elements with an entry for the new element, and let node be // the new element. new_node = token_to_element(node.token, NS_HTML, ca) - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === node) { - afe[i] = new_node - break - } + if ((i = afe.indexOf(node)) !== -1) { + afe[i] = new_node } - for (i = 0; i < open_els.length; ++i) { - t = open_els[i] - if (t === node) { - node_above = open_els[i + 1] - open_els[i] = new_node - break - } + if ((i = open_els.indexOf(node)) !== -1) { + node_above = open_els[i + 1] + open_els[i] = new_node } node = new_node // 8. If last node is furthest block, then move the // aforementioned bookmark to be immediately after the new node // in the list of active formatting elements. if (last_node === fb) { - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === bookmark) { - afe.splice(i, 1) - break - } + if ((i = afe.indexOf(bookmark)) !== -1) { + afe.splice(i, 1) } - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === node) { - // "after" means lower - afe.splice(i, 0, bookmark) // "after as <- - break - } + if ((i = afe.indexOf(node)) !== -1) { + // "after" means lower + afe.splice(i, 0, bookmark) // "after as <- } } // 9. Insert last node into node, first removing it from its // previous parent node if any. if (last_node.parent != null) { - for (i = 0; i < last_node.parent.children.length; ++i) { - c = last_node.parent.children[i] - if (c === last_node) { - last_node.parent.children.splice(i, 1) - break - } + if ((i = last_node.parent.children.indexOf(last_node)) !== -1) { + last_node.parent.children.splice(i, 1) } } node.children.push(last_node) @@ -1414,12 +1394,8 @@ parse_html = function (args_html, args) { // * last_node is fb // * last_node is still in the tree (not a duplicate) if (last_node.parent != null) { - for (i = 0; i < last_node.parent.children.length; ++i) { - c = last_node.parent.children[i] - if (c === last_node) { - last_node.parent.children.splice(i, 1) - break - } + if ((i = last_node.parent.children.indexOf(last_node)) !== -1) { + last_node.parent.children.splice(i, 1) } } // can't use standard insert token thing, because it's already in @@ -1445,36 +1421,20 @@ parse_html = function (args_html, args) { // elements, and insert the new element into the list of active // formatting elements at the position of the aforementioned // bookmark. - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === fe) { - afe.splice(i, 1) - break - } + if ((i = afe.indexOf(fe)) !== -1) { + afe.splice(i, 1) } - for (i = 0; i < afe.length; ++i) { - t = afe[i] - if (t === bookmark) { - afe[i] = new_element - break - } + if ((i = afe.indexOf(bookmark)) !== -1) { + afe[i] = new_element } // 19. Remove formatting element from the stack of open elements, // and insert the new element into the stack of open elements // immediately below the position of furthest block in that stack. - for (i = 0; i < open_els.length; ++i) { - t = open_els[i] - if (t === fe) { - open_els.splice(i, 1) - break - } + if ((i = open_els.indexOf(fe)) !== -1) { + open_els.splice(i, 1) } - for (i = 0; i < open_els.length; ++i) { - t = open_els[i] - if (t === fb) { - open_els.splice(i, 0, new_element) - break - } + if ((i = open_els.indexOf(fb)) !== -1) { + open_els.splice(i, 0, new_element) } // 20. Jump back to the step labeled outer loop. } @@ -1559,7 +1519,7 @@ parse_html = function (args_html, args) { // http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes // http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node adjusted_insertion_location = function (override_target) { - var c, el, i, j, l, last_table, last_table_i, last_template, last_template_i, len, len1, len2, m, previous_element, ref, target, target_i + var c, el, i, j, l, last_table, last_table_i, last_template, last_template_i, len, len1, len2, m, previous_element, target, target_i // 1. If there was an override target specified, then let target be the // override target. if (override_target != null) { @@ -1713,6 +1673,7 @@ parse_html = function (args_html, args) { position = adjusted_insertion_location() } position[0].children.splice(position[1], 0, t) + t.parent = position[0] return } @@ -1959,6 +1920,7 @@ parse_html = function (args_html, args) { el.flag('parser-inserted', true) // fixfull frament case ail[0].children.splice(ail[1], 0, el) + el.parent = ail[0] open_els.unshift(el) tok_state = tok_state_script_data original_ins_mode = ins_mode // make sure orig... is defined @@ -2143,7 +2105,7 @@ parse_html = function (args_html, args) { } } ins_mode_in_body = function (t) { - var a, aa, ab, ac, el, found, h_in_scope, i, input_el, j, l, len, len1, len10, len11, len12, len13, len14, len2, len3, len4, len5, len6, len7, len8, len9, m, n, node, o, ok_tags, prompt, q, r, ref, ref1, ref2, ref3, ref4, root_attrs, s, second, second_i, u, w, y, z + var a, aa, ab, ac, el, found, h_in_scope, i, input_el, j, l, len, len1, len10, len11, len12, len13, len14, len2, len3, len4, len5, len6, len7, len8, len9, m, n, node, o, ok_tags, prompt, q, r, root_attrs, s, second, second_i, u, w, y, z if (t.type === TYPE_TEXT && t.text === "\u0000") { parse_error() return @@ -6062,17 +6024,15 @@ parse_html = function (args_html, args) { return doc.children } -exports.parse = parse_html -exports.Node = Node -exports.debug_log_reset = debug_log_reset -exports.debug_log_each = debug_log_each -exports.TYPE_TAG = TYPE_TAG -exports.TYPE_TEXT = TYPE_TEXT -exports.TYPE_COMMENT = TYPE_COMMENT -exports.TYPE_DOCTYPE = TYPE_DOCTYPE -exports.NS_HTML = NS_HTML -exports.NS_MATHML = NS_MATHML -exports.NS_SVG = NS_SVG -exports.QUIRKS_NO = QUIRKS_NO -exports.QUIRKS_LIMITED = QUIRKS_LIMITED -exports.QUIRKS_YES = QUIRKS_YES +var this_module = { + parse: parse_html, + Node: Node, +} + +if (context === 'module') { + module.exports = this_module +} else { + window.peach_parser = this_module +} + +}).call(this)