-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTextProcessing.py
254 lines (213 loc) · 11.3 KB
/
TextProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#=======================================================================================================
# Tafadzwa Pasipanodya
# Computer Science SYE
# Module provides file I/O functionality for a document summarizer
#=======================================================================================================
import nltk
import os
import math
import string
import sentence
class TextProcessing(object):
    # File I/O and text clean-up helpers for the document summarizer.

    # Stemmed tokens that carry no content and are dropped from every sentence.
    _IGNORED_TOKENS = frozenset(['.', '`', ',', '?', "'", '!', '"', "''", "'s"])

    # File names skipped by get_all_files so code is never treated as data.
    _CODE_FILES = frozenset([
        "DocSimilarity.py", "DocSimilarity.pyc",
        "DocumentScoring.py", "DocumentScoring.pyc",
        "LexRankSummarizer.py", "LexRankSummarizer.pyc",
        "MMR.py",
        "MMR_Summarizer.py", "MMR_Summarizer.pyc",
        "Main.py",
        "TextProcessing.py", "TextProcessing.pyc",
        "sentence.py", "sentence.pyc",
        "test.py",
    ])

    # Default locations of the Stanford NER model and jar; getNames accepts overrides.
    NER_CLASSIFIER_PATH = 'C:/Users/tmpasi10/Desktop/ner/classifiers/english.all.3class.distsim.crf.ser.gz'
    NER_JAR_PATH = 'C:/Users/tmpasi10/Desktop/ner/stanford-ner.jar'

    #----------------------------------------------------------------------------------------------------
    # Method for file IO processes. Opens a file, removes HTML tags, splits the text into sentences
    # and stems the words of every sentence.
    #
    # Preconditions: 1.) file_path_and_name != None
    #                2.) type(file_path_and_name) == str
    #                3.) file_path_and_name is a valid file address in the file system
    #
    # Returns: a list of sentence objects, one per non-empty sentence. If the file cannot be opened
    #          or read, an error is printed and a list holding a single empty sentence object is
    #          returned. An IOError raised after the file has been read is no longer reported as
    #          "File not found"; it propagates to the caller.
    #----------------------------------------------------------------------------------------------------
    def processFile(self, file_path_and_name):
        try:
            # 'with' closes the handle even when read() fails
            with open(file_path_and_name, 'r') as source_file:
                text = source_file.read()
        except IOError:
            # the tuple wrapper keeps the % formatting safe for any argument type
            print('Oops! File not found %s' % (file_path_and_name,))
            return [sentence.sentence(file_path_and_name, [], [])]

        # remove HTML tags
        # NOTE(review): newlines are removed without inserting a space, so words on either side of a
        # line break are glued together - confirm this is intended for the input corpus.
        # NOTE(review): nltk.clean_html appears to be unavailable in NLTK 3 - confirm the NLTK version.
        text = nltk.clean_html(text.replace('\n', ''))

        # segment the text into a list of sentences
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        lines = sent_tokenizer.tokenize(text.strip())

        porter = nltk.PorterStemmer()
        sentences = []
        for original in lines:
            words = nltk.word_tokenize(original.strip().lower())
            # stem first, then drop punctuation tokens; a real list so the emptiness test below
            # behaves the same on Python 2 and Python 3
            stems = [porter.stem(word) for word in words]
            stems = [stem for stem in stems if stem not in self._IGNORED_TOKENS]
            # no empty sentences
            if stems:
                sentences.append(sentence.sentence(file_path_and_name, stems, original))
        return sentences

    #-------------------------------------------------------------------------------------------------------
    # Method to process a document by replacing all versions of a name with the fullest version of the
    # same name.
    #
    # Preconditions: 1. type(doc) == list-of-str. Each string is a sentence.
    #
    # Returns: the same list object, updated in place. Each string is a sentence.
    #-------------------------------------------------------------------------------------------------------
    def use_full_names(self, doc):
        names = self.getNames(doc)
        for index, line in enumerate(doc):
            doc[index] = self.getLongName(line, names)
        return doc

    #----------------------------------------------------------------------------------------------------------
    # Method to get all the multi-word person names in a document
    #
    # Preconditions: 1. type(doc) == list-of-str. This document is a list of sentences. A sentence is a string
    #                2. classifier_path / jar_path, when given, locate the Stanford NER model and jar;
    #                   when omitted the class defaults NER_CLASSIFIER_PATH / NER_JAR_PATH are used
    #
    # Returns: a list-of-str. Each string in the list is a two or three word name, in document order
    #          (a name that occurs several times is listed several times)
    #----------------------------------------------------------------------------------------------------------
    def getNames(self, doc, classifier_path=None, jar_path=None):
        # join sentences into one long string and split it into a list of words
        words = ' '.join(doc).split()
        # load the stanford named entity classifier
        tagger = nltk.tag.stanford.NERTagger(classifier_path or self.NER_CLASSIFIER_PATH,
                                             jar_path or self.NER_JAR_PATH)
        # one (word, tag) pair per word
        tags = tagger.tag(words)
        return self._collect_person_names(tags, ' '.join(words))

    #----------------------------------------------------------------------------------------------------------
    # Helper that builds names from runs of consecutive PERSON tags.
    #
    # Preconditions: 1. tags is a sequence of (word, tag) pairs
    #                2. text is the document the tags were produced from, as one string
    #
    # Returns: a list-of-str. Every run of two or three consecutive PERSON words becomes one name; the
    #          scan resumes after the run, so no word is used in two names. A lone PERSON word is skipped.
    #----------------------------------------------------------------------------------------------------------
    @staticmethod
    def _collect_person_names(tags, text):
        names = []
        total = len(tags)
        i = 0
        while i < total:
            # length of the PERSON run starting at i, capped at three words
            run = 0
            while run < 3 and i + run < total and tags[i + run][1] == 'PERSON':
                run += 1
            if run >= 2:
                name = ' '.join(tag[0] for tag in tags[i:i + run])
                # only keep names that really occur in the text
                if name in text:
                    names.append(name)
                i += run
            else:
                i += 1
        return names

    #---------------------------------------------------------------------------------------------------------
    # Method to replace all shortened versions of a name with their original long version
    #
    # Preconditions: 1. type(sentence) == str and type(names) == list-of-str
    #                2. sentence != None and names != None
    #
    # Matching is done on whole words: a run of words is a shortened name when it equals the first or the
    # last words of a known name (e.g. 'Jane' or 'Doe' for 'Jane Doe'). Punctuation attached to the end
    # of the last word is ignored for matching and kept in the output. A name that is already complete
    # is left untouched. When several names fit, the longest match wins; ties go to the earliest entry
    # in names. Names consisting of a single word are ignored.
    #
    # Returns: a string. The sentence with every shortened name replaced by its full version
    #---------------------------------------------------------------------------------------------------------
    def getLongName(self, sentence, names):
        words = sentence.split(" ")
        # (full name, its words) for every name that has something to expand
        known = [(name, name.split()) for name in names]
        known = [(name, parts) for name, parts in known if len(parts) > 1]

        result = []
        i = 0
        while i < len(words):
            span, full_name = self._longest_name_match(words, i, known)
            if full_name is None:
                result.append(words[i])
                i += 1
            else:
                # re-attach the punctuation that followed the matched words
                last = words[i + span - 1]
                trailing = last[len(last.rstrip(string.punctuation)):]
                result.append(full_name + trailing)
                i += span
        return ' '.join(result)

    #---------------------------------------------------------------------------------------------------------
    # Helper that finds the longest known-name match starting at words[start].
    #
    # Preconditions: 1. words is a list-of-str and 0 <= start < len(words)
    #                2. known is a list of (name, list-of-words-in-name) pairs
    #
    # Returns: a pair (number of words matched, full name), or (0, None) when nothing matches
    #---------------------------------------------------------------------------------------------------------
    @staticmethod
    def _longest_name_match(words, start, known):
        best_span, best_name = 0, None
        for name, parts in known:
            longest = min(len(parts), len(words) - start)
            # only spans longer than the current best can improve the result
            for span in range(longest, best_span, -1):
                window = words[start:start + span]
                window[-1] = window[-1].rstrip(string.punctuation)
                if window == parts[:span] or window == parts[-span:]:
                    best_span, best_name = span, name
                    break
        return best_span, best_name

    #---------------------------------------------------------------------------------------------------------
    # Method to check whether a word is part of the beginning or ending of a recognized name
    #
    # Preconditions: 1. type(word) == str and type(name) == str
    #                   word is always any word from a sentence. name is a complete version of a name, eg
    #                   word = 'Jane', name = 'Jane Doe'
    #
    # Returns: a Boolean. The comparison is made character by character, so 'Jan' also matches 'Jane Doe'
    #---------------------------------------------------------------------------------------------------------
    def begins_or_ends_with(self, word, name):
        return name.startswith(word) or name.endswith(word)

    #--------------------------------------------------------------------------------------------------------
    # Method to get a document's file path
    #
    # Preconditions: 1. file_name refers to a file somewhere below the current working directory
    #                2. type(file_name) == str and file_name != None
    #
    # Returns: a string; the path of the first file called file_name found while walking the current
    #          working directory, or "" (after printing an error) when there is no such file
    #--------------------------------------------------------------------------------------------------------
    def get_file_path(self, file_name):
        for root, dirs, files in os.walk(os.getcwd()):
            if file_name in files:
                return os.path.join(root, file_name)
        print("Error! file was not found!!")
        return ""

    #--------------------------------------------------------------------------------------------------------
    # Method to get all file names from a directory
    #
    # Preconditions: 1.) path is the directory path in string format, or None for the current
    #                    working directory
    #
    # Returns: a list of the paths of all files in the directory and its sub-directories, except the
    #          summarizer's own code files
    #--------------------------------------------------------------------------------------------------------
    def get_all_files(self, path = None):
        # use current directory if no path given
        if path is None:
            path = os.getcwd()
        retval = []
        for root, dirs, files in os.walk(path):
            for name in files:
                # make sure we aren't considering code files as data files
                if name not in self._CODE_FILES:
                    retval.append(os.path.join(root, name))
        return retval

    #--------------------------------------------------------------------------------------------------------
    # Method to open all documents in a given directory
    #
    # Preconditions: path is the directory for a cluster, or None for the current working directory
    #
    # Returns: a list of sentence objects covering every file found; please check the sentence object
    #          documentation
    #--------------------------------------------------------------------------------------------------------
    def openDirectory(self, path=None):
        sentences = []
        for file_path in self.get_all_files(path):
            # extend in place instead of rebuilding the whole list for every file
            sentences.extend(self.processFile(file_path))
        return sentences