-
Notifications
You must be signed in to change notification settings - Fork 0
/
trechelpers.py
141 lines (116 loc) · 6.42 KB
/
trechelpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import math
import enchant
import nltk
import stop_words
import string
import os
from natsort import natsorted
import xml.etree.ElementTree as et
## function parses XML tree, returns a dictionary object to be used by other functions
## specify which parent element the child node you want to retrieve the text from, as well as
## the child element. By default, it will pull the summary text from the 'topic' XML element
def get_xml_text(filename, parent_element='topic', child_element='summary'):
inFile = open(filename, 'r') # creates file object with inFile
tree = et.parse(inFile) # creates an element tree object of the inFile XML
root = tree.getroot() # determines root of XML
xmlOutput = {} # initializes the array which will hold topicNum:processedText
for topic in root.findall(parent_element): # loops through each parent element
key = str(topic.get('number'))
value = topic.find(child_element).text # returns text within the child element
xmlOutput[key] = value
return xmlOutput
## function iterates through each word in a passed dictionary and adds words not found
## in the dictionary to the end of the query text, you can control how many words are added
## by the optional percent parameter, by default it adds the ceiling of 20% of the words
## suggested by the enchant spell check.
def check_spelling(dictionary, percent=0.20):
d = enchant.Dict("en_US") # defines dictionary to use
for key, value in dictionary.items(): # iterates through passed dictionary of query text
addlWords = []
for word in value.split():
if not d.check(word):
suggestions = d.suggest(word)
if suggestions:
numInclude = math.ceil(len(suggestions) * percent)
addlWords.append(' '.join(suggestions[0:numInclude]))
dictionary[key] = dictionary[key] + ' ' + ' '.join(addlWords)
return dictionary
## function takes a dictionary/array of queryNumber : Text pairs and runs it through the
## NLTK plain language tokenizer, you can change the tokenizer used, see nltk.org for details
def tokenize_text(dictionary, tokenizer=nltk.TweetTokenizer()):
tknzr = tokenizer # sets the tokenizer used
for key, value in dictionary.items(): # iterates through dictionary
tokenText = tknzr.tokenize(value) # tokenizes the text in the dictionary
cleanText = [] # empty list for cleaned text
for word in tokenText: # removes all punctuation per indri specs
cleanWord = word
for item in string.punctuation: # interates through list of punctuation, replaces with space
cleanWord = cleanWord.replace(str(item), ' ')
cleanWord = cleanWord.strip(
string.punctuation).lower() # removes list items that are only punct and makes LC
cleanWord = cleanWord.strip() # removes empty space
if cleanWord: # to ignore now empty list elements
cleanText.append(cleanWord) # adds word to list
dictionary[key] = ' '.join(map(str, cleanText)) # writes list back out to dictionary
return dictionary
## function merges two dictionaries with the same key into indri trec-formatted query file and allows
## you to set weight for Indri to give the text from each dictionary and writes the bsub file to
## call with 'bsub < "filename.bsub"' on Killdevil (you'll need to edit the
## write bsub function to point to your directory
## p.s. this is super lame code that doesn't even use an XMLobject, but it's easy :)
def write_to_trec_two(name, dictionary1, dictionary2, weight1=1, weight2=1, dir=''):
w1 = str(weight1)
w2 = str(weight2)
if not os.path.exists(dir):
os.makedirs(dir)
outFile = open(os.path.join(dir, (name + '.xml')), 'w')
outFile.write('<parameter>\n')
for key in natsorted(dictionary1.keys()):
outFile.write('<query>\n')
outFile.write('\t<number>' + str(key) + '</number>\n')
outFile.write('\t<text>\n')
outFile.write('\t\t#weight( ' + str(weight1) + ' #combine(' + dictionary1[key] + ')\n')
outFile.write('\t\t ' + str(weight2) + ' #combine(' + dictionary2[key] + '))\n')
outFile.write('\t</text>\n')
outFile.write('</query>\n')
outFile.write('</parameter>\n')
outFile.close()
## simply writes a dictionary out to indri trec-formatted query file
## p.s. this is super lame code that doesn't even use an XMLobject, but it's easy :)
def write_to_trec_one(name, dictionary, dir=''):
outFile = open(os.path.join(dir, (name + '.xml')), 'w')
outFile.write('<parameter>\n')
for key in natsorted(dictionary.keys()):
outFile.write('<query>\n')
outFile.write('\t<number>' + str(key) + '</number>\n')
outFile.write('\t<text>\n')
outFile.write('\t\t#combine(' + dictionary[key] + ')\n')
outFile.write('\t</text>\n')
outFile.write('</query>\n')
outFile.write('</parameter>\n')
outFile.close()
## this function generates the bsub command file, based on the parameters passed and
## then you can call it on KillDevil with the ' bsub < "name.bsub" ' command.
## Defaults to the day queue and pwd, you can spec a directory from the location where you run it.
def generate_bsub_file(name, KDdir='', dir='', queue='day'):
bsubFile = open(os.path.join(dir, (name + '.bsub')), 'w')
bsubFile.write('#BSUB -J ' + name)
bsubFile.write('\n#BSUB -q ' + queue)
bsubFile.write('\n#BSUB -o ' + KDdir + '/' + name + '-%J.output')
bsubFile.write(
'\n/webdex/expir/indri/indri-5.4/runquery/IndriRunQuery ' + KDdir + '/' + name + '.xml -index=/webdex/expir/index/pmc -count=1000 -trecFormat=true > ' + KDdir + '/' + name + '.results')
bsubFile.write(
'\n/webdex/expir/script/trec_eval.9.0/trec_eval -q /webdex/expir/corpus/pmc/qrels.txt ' + KDdir + '/' + name + '.results >' + KDdir + '/' + name + '.stats')
bsubFile.close()
## function removes stopwords based on the basic english list in the stop_words module
## optionally pass a python list of different stopwords (example, a list of non medical terms)
def remove_stopwords(dictionary, swords='None'):
if not swords:
swords = stop_words.get_stop_words('english')
for key, value in dictionary.items():
newValue = []
for word in value.split():
if word not in swords:
newValue.append(word)
dictionary[key] = ' '.join(map(str, newValue))
return dictionary