# IR_phase2.py
!pip install hazm
!pip3 install parsivar
import json
import re
import math
import numpy as np
from copy import deepcopy
from itertools import islice
from google.colab import drive
from hazm import utils
from parsivar import Tokenizer, Normalizer, FindStems
drive.mount('/content/drive')
with open("/content/drive/MyDrive/IR_data_news_12k.json") as f:
    news_dataset = json.load(f)
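
# A quick sanity check (illustrative, based on how the fields are read below): each
# record in news_dataset is assumed to be keyed by a numeric docID string and to
# carry 'title', 'url' and 'content' fields.
first_docID = next(iter(news_dataset))
print(first_docID, list(news_dataset[first_docID].keys()))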
def stemming(tokens, docID, mode):
    # mode=1: stemming for news content, mode=2: stemming for a query
    stemmed_token_list = []
    my_stemmer = FindStems()
    for token in tokens:
        stemmed_token = my_stemmer.convert_to_stem(token)
        stemmed_token = stemmed_token.split('&')
        if mode == 1:
            stemmed_token_list.append((stemmed_token[0], int(docID)))
        elif mode == 2:
            stemmed_token_list.append(stemmed_token[0])
    return stemmed_token_list
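
# A hedged note on the split('&') above: parsivar's FindStems can return a compound
# stem joined by '&' (commonly for verbs), so only the first variant is kept. A tiny
# sketch in query mode (mode=2); the exact stems depend on the parsivar version:
print(stemming(['رفتند', 'کتاب‌ها'], -1, 2))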
def preprocessing(news_dataset):
    tokenizer = Tokenizer()
    normalizer = Normalizer(statistical_space_correction=True)
    term_docID = []
    news_title_url = {}
    counter = 0
    for docID in news_dataset:
        if int(docID) % 1000 == 0:
            print(docID)
        content = news_dataset[docID]['content']
        url = news_dataset[docID]['url']
        title = news_dataset[docID]['title']
        news_title_url[int(docID)] = [title, url]
        # removing punctuation from the content
        new_content = re.sub(r'[^\w\s]', '', content)
        # normalizing the cleaned content
        normalized_content = normalizer.normalize(new_content)
        # getting the tokens (non-positional)
        tokenized_content = tokenizer.tokenize_words(normalized_content)
        # removing stopwords
        stopwords = utils.stopwords_list()
        for token in deepcopy(tokenized_content):
            if token in stopwords:
                tokenized_content.remove(token)
        # stemming
        term_docID.extend(stemming(tokenized_content, docID, 1))
    return term_docID, news_title_url

term_docID, news_title_url = preprocessing(news_dataset)
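
# Shape check (illustrative): term_docID is a flat list of (stemmed_term, docID)
# pairs, one per token that survived stopword removal, and news_title_url maps each
# docID to its [title, url].
print(len(term_docID), term_docID[:3])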
# postings are stored as {docID: term frequency}; token positions themselves are not kept
def positional_indexing(term_docID):
    positional_index = {}
    cnt = 0
    pre_docID = 0
    position = 1
    for item in term_docID:
        token = item[0]
        docID = item[1]
        if pre_docID != docID:
            position = 1
        position_dic = {}
        if token not in positional_index:
            # first occurrence of the term: document frequency 1, term frequency 1 in this doc
            position_dic[docID] = 1
            positional_index[token] = [1, position_dic]
        else:
            value = positional_index[token]
            position_dic = value[1]
            if docID not in position_dic:
                # term appears in a new document: bump its document frequency
                rep = value[0]
                rep += 1
                value[0] = rep
                position_dic[docID] = 0
            position_dic[docID] += 1
            positional_index[token] = value
        position += 1
        pre_docID = docID
        cnt += 1
        if cnt % 200000 == 0:
            print(cnt)
    # save the dictionary to check it
    try:
        positional_index_file = open('positional_index.txt', 'wt')
        positional_index_file.write(str(positional_index))
        positional_index_file.close()
    except:
        print("Unable to write to file")
    return positional_index


positional_index = positional_indexing(term_docID)
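
# Structure check (illustrative): each entry has the form
#   positional_index[term] = [document_frequency, {docID: term_frequency_in_doc, ...}]
sample_term = next(iter(positional_index))
print(sample_term, positional_index[sample_term][0],
      list(islice(positional_index[sample_term][1].items(), 3)))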
def take(n, iterable):
    # return the first n items of the iterable as a list
    return list(islice(iterable, n))
# build the champions list
def build_champions_list(positional_index):
    champions_list = {}
    for term in positional_index:
        postings = positional_index[term][1]
        # finding the most relevant docs for the term (highest term frequency first)
        sorted_postings = {}
        for doc, term_freq in sorted(postings.items(), key=lambda item: item[1], reverse=True):
            sorted_postings[doc] = term_freq
        # take() returns a list of (doc, term_freq) pairs
        most_relevant_docs_list = take(50, sorted_postings.items())
        candidate_list = {}
        for candidate_doc in most_relevant_docs_list:
            candidate_list[candidate_doc[0]] = candidate_doc[1]
        champion_list = [positional_index[term][0], candidate_list]
        champions_list[term] = champion_list
    # save the dictionary to check it
    try:
        champions_list_file = open('champions_list.txt', 'wt')
        champions_list_file.write(str(champions_list))
        champions_list_file.close()
    except:
        print("Unable to write to file")
    return champions_list


champions_list = build_champions_list(positional_index)
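
# Quick check (illustrative): the champions list keeps the same [df, postings] layout,
# but each postings dict is truncated to the (at most) 50 docs with the highest
# term frequency for that term.
sample_term = next(iter(champions_list))
print(sample_term, champions_list[sample_term][0], len(champions_list[sample_term][1]))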
def query_preprocessing(query_content):
    tokenizer = Tokenizer()
    normalizer = Normalizer(statistical_space_correction=True)
    preprocessed_query = []
    # removing punctuation from the query
    no_punc_query_content = re.sub(r'[^\w\s]', '', query_content)
    # normalizing the cleaned query
    normalized_query_content = normalizer.normalize(no_punc_query_content)
    # getting the tokens (non-positional)
    tokenized_query_content = tokenizer.tokenize_words(normalized_query_content)
    # removing stopwords
    stopwords = utils.stopwords_list()
    for token in deepcopy(tokenized_query_content):
        if token in stopwords:
            tokenized_query_content.remove(token)
    # stemming
    preprocessed_query.extend(stemming(tokenized_query_content, -1, 2))
    return preprocessed_query
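
# Usage sketch: the query goes through the same pipeline as the documents
# (punctuation removal, normalization, tokenization, stopword removal, stemming),
# so its terms line up with the index vocabulary; the exact output depends on the
# installed library versions.
print(query_preprocessing('تیم ملی فوتبال'))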
# calculating the length (Euclidean norm) of each document's tf-idf vector
def calculate_doc_length(positional_index, doc_num):
    doc_length_array = np.zeros(doc_num)
    # each doc is a vector of tf-idf weights
    for t in positional_index:
        for d in positional_index[t][1]:
            tf = 1 + math.log10(positional_index[t][1][d])
            idf = math.log10(doc_num / positional_index[t][0])
            doc_length_array[int(d)] += (tf * idf) ** 2
    doc_length_array = doc_length_array ** 0.5
    return doc_length_array
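
# Worked example of the weighting used above (log-scaled tf, standard idf):
#   w(t, d) = (1 + log10(tf_t,d)) * log10(N / df_t)
# With illustrative numbers tf = 3, df = 100, N = 12000 (roughly this dataset's size):
example_weight = (1 + math.log10(3)) * math.log10(12000 / 100)
print(round(example_weight, 3))  # ≈ 3.071; a doc's length is the Euclidean norm of such weights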
def format_search_results(sorted_top_k, news_title_url):
    counter = 0
    if len(sorted_top_k) == 0:
        print('no results found')
        return
    else:
        for top in sorted_top_k:
            title = news_title_url[top[0]][0]
            url = news_title_url[top[0]][1]
            print('{}. document number: {} with cosine similarity: {}'.format(counter + 1, top[0], top[1]))
            print('url   : {}'.format(url))
            print('title : {}'.format(title))
            counter += 1
            if counter < len(sorted_top_k):
                print('------------------------------------------------------')
def search_query(query, positional_index, doc_num, news_title_url):
    doc_scores = np.zeros(doc_num)
    doc_length_array = calculate_doc_length(positional_index, doc_num)
    # how many of the query terms each doc contains
    doc_query_frequency = np.zeros(doc_num)
    query_terms = {}
    processed_query = query_preprocessing(query)
    # calculating tf for each term of the given query
    for query_term in processed_query:
        if query_term in query_terms:
            query_terms[query_term] += 1
        else:
            query_terms[query_term] = 1
    # index elimination
    for term in query_terms:
        if positional_index.get(term) is not None:
            term_postings_list = positional_index[term][1]
            for doc in term_postings_list:
                doc_query_frequency[doc] += 1
        else:
            continue
    for term in query_terms:
        # checking whether the query term exists in the dictionary
        if positional_index.get(term) is not None:
            term_postings_list = positional_index[term][1]
            term_query_weight = 1 + math.log10(query_terms[term])
            for doc in term_postings_list:
                # only considering docs that contain at least n-1 of the query terms
                if doc_query_frequency[doc] >= (len(query_terms) - 1):
                    tf = 1 + math.log10(positional_index[term][1][doc])
                    idf = math.log10(doc_num / positional_index[term][0])
                    term_doc_weight = tf * idf
                    doc_scores[int(doc)] += term_query_weight * term_doc_weight
                else:
                    continue
        else:
            continue
    normalized_score = {}
    for doc in range(doc_num):
        if doc_length_array[doc] != 0.0:
            normalized_score[doc] = doc_scores[doc] / doc_length_array[doc]
    candidates = {}
    for doc, doc_score in sorted(normalized_score.items(), key=lambda item: item[1], reverse=True):
        candidates[doc] = doc_score
    # keep the 10 highest-scoring documents with a non-zero score
    top_k_docs = list(candidates.items())[:10]
    final_top_k = []
    for doc in top_k_docs:
        if doc[1] != 0.0:
            final_top_k.append(doc)
    return final_top_k

sorted_top_k = search_query('تیم ملی فوتبال', champions_list, len(news_title_url), news_title_url)
format_search_results(sorted_top_k, news_title_url)
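
# Optional comparison (a sketch, not part of the original run): the same query can be
# scored against the full positional index instead of the champions list; rankings may
# differ slightly because each champions-list posting keeps at most 50 documents.
full_index_top_k = search_query('تیم ملی فوتبال', positional_index, len(news_title_url), news_title_url)
format_search_results(full_index_top_k, news_title_url)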