From bbfb2a7b296587d505da081e71435f4d56eb4236 Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Sun, 24 Nov 2013 18:05:15 -0500 Subject: [PATCH] optimize wiktionary parsing --- main.coffee | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.coffee b/main.coffee index 02b1bdb..860dd5d 100644 --- a/main.coffee +++ b/main.coffee @@ -647,14 +647,14 @@ extract_wiktionary_definiton = (html) -> # archive,codebase,data,usemap: # href: # id,class,style: background: url(foo.png), etc - html = html.replace /(src|onload|archive|codebase|data|usemap|href|style|id|class)=['"][^"']*['"]/ig, '', html + html = html.replace /[ ]?[a-z]+=['"][^"']*['"]/ig, '', html + html = html.replace /<\/?(audio|source|a|span|table|tr|td|table)>/ig, '', html + html = html.replace /\[edit\]/ig, '', html elements = $(html) valid_parts = ["Abbreviation", "Adjective", "Adverb", "Article", "Cardinal number", "Conjunction", "Determiner", "Interjection", "Noun", "Numeral", "Particle", "Preposition", "Pronoun", "Verb"] - edit_link_regex = new RegExp(' ?\\[edit\\] ?') - elements.each (i, el) -> #which tag: el.tagName if el.tagName is 'H2' @@ -663,10 +663,10 @@ extract_wiktionary_definiton = (html) -> if found return false # break part = false # mark us not being in a definition section unless the next section finds a part of speach header - language = $(el).text().replace(edit_link_regex, '') + language = $(el).text() if language and el.tagName is 'H3' or el.tagName is 'H4' # eg yak def uses one for english and one for dutch part = false - text = $(el).text().replace(edit_link_regex, '') + text = $(el).text() for p in valid_parts if text is "#{p}" part = p.toLowerCase() -- 1.7.10.4