optimize wiktionary parsing

author Jason Woofenden <jason@jasonwoof.com>

Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)

committer Jason Woofenden <jason@jasonwoof.com>

Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)
author Jason Woofenden <jason@jasonwoof.com>
Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)
diff --git a/main.coffee b/main.coffee

index 02b1bdb..860dd5d 100644 (file)
--- a/main.coffee
+++ b/main.coffee
@@ -647,14 +647,14 @@ extract_wiktionary_definiton = (html) ->
         #   archive,codebase,data,usemap: <object>
         #                           href: <link>
         #                 id,class,style: background: url(foo.png), etc
-       html = html.replace /(src|onload|archive|codebase|data|usemap|href|style|id|class)=['"][^"']*['"]/ig, '', html
+       html = html.replace /[ ]?[a-z]+=['"][^"']*['"]/ig, '', html
+       html = html.replace /<\/?(audio|source|a|span|table|tr|td|table)>/ig, '', html
+       html = html.replace /\[edit\]/ig, '', html
  
         elements = $(html)
  
         valid_parts = ["Abbreviation", "Adjective", "Adverb", "Article", "Cardinal number", "Conjunction", "Determiner", "Interjection", "Noun", "Numeral", "Particle", "Preposition", "Pronoun", "Verb"]
  
-       edit_link_regex = new RegExp(' ?\\[edit\\] ?')
-
         elements.each (i, el) ->
                 #which tag: el.tagName
                 if el.tagName is 'H2'
@@ -663,10 +663,10 @@ extract_wiktionary_definiton = (html) ->
                         if found
                                 return false # break
                         part = false # mark us not being in a definition section unless the next section finds a part of speach header
-                       language = $(el).text().replace(edit_link_regex, '')
+                       language = $(el).text()
                 if language and el.tagName is 'H3' or el.tagName is 'H4' # eg yak def uses one for english and one for dutch
                         part = false
-                       text = $(el).text().replace(edit_link_regex, '')
+                       text = $(el).text()
                         for p in valid_parts
                                 if text is "#{p}"
                                         part = p.toLowerCase()
author	Jason Woofenden <jason@jasonwoof.com>
	Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)
committer	Jason Woofenden <jason@jasonwoof.com>
	Sun, 24 Nov 2013 23:05:15 +0000 (18:05 -0500)