Got questions, comments, patches, etc.?
Contact Jason Woofenden
gitweb
/
hexbog.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
0521f06
)
optimize wiktionary parsing
author
Jason Woofenden
<jason@jasonwoof.com>
Sun, 24 Nov 2013 23:05:15 +0000
(18:05 -0500)
committer
Jason Woofenden
<jason@jasonwoof.com>
Sun, 24 Nov 2013 23:05:15 +0000
(18:05 -0500)
main.coffee
patch
|
blob
|
history
diff --git
a/main.coffee
b/main.coffee
index
02b1bdb
..
860dd5d
100644
(file)
--- a/
main.coffee
+++ b/
main.coffee
@@
-647,14
+647,14
@@
extract_wiktionary_definiton = (html) ->
# archive,codebase,data,usemap: <object>
# href: <link>
# id,class,style: background: url(foo.png), etc
# archive,codebase,data,usemap: <object>
# href: <link>
# id,class,style: background: url(foo.png), etc
- html = html.replace /(src|onload|archive|codebase|data|usemap|href|style|id|class)=['"][^"']*['"]/ig, '', html
+ html = html.replace /[ ]?[a-z]+=['"][^"']*['"]/ig, '', html
+ html = html.replace /<\/?(audio|source|a|span|table|tr|td|table)>/ig, '', html
+ html = html.replace /\[edit\]/ig, '', html
elements = $(html)
valid_parts = ["Abbreviation", "Adjective", "Adverb", "Article", "Cardinal number", "Conjunction", "Determiner", "Interjection", "Noun", "Numeral", "Particle", "Preposition", "Pronoun", "Verb"]
elements = $(html)
valid_parts = ["Abbreviation", "Adjective", "Adverb", "Article", "Cardinal number", "Conjunction", "Determiner", "Interjection", "Noun", "Numeral", "Particle", "Preposition", "Pronoun", "Verb"]
- edit_link_regex = new RegExp(' ?\\[edit\\] ?')
-
elements.each (i, el) ->
#which tag: el.tagName
if el.tagName is 'H2'
elements.each (i, el) ->
#which tag: el.tagName
if el.tagName is 'H2'
@@
-663,10
+663,10
@@
extract_wiktionary_definiton = (html) ->
if found
return false # break
part = false # mark us as not being in a definition section unless the next section finds a part-of-speech header
if found
return false # break
part = false # mark us as not being in a definition section unless the next section finds a part-of-speech header
- language = $(el).text().replace(edit_link_regex, '')
+ language = $(el).text()
if language and el.tagName is 'H3' or el.tagName is 'H4' # eg yak def uses one for english and one for dutch
part = false
if language and el.tagName is 'H3' or el.tagName is 'H4' # eg yak def uses one for english and one for dutch
part = false
- text = $(el).text().replace(edit_link_regex, '')
+ text = $(el).text()
for p in valid_parts
if text is "#{p}"
part = p.toLowerCase()
for p in valid_parts
if text is "#{p}"
part = p.toLowerCase()