import math
import os
import pickle
from collections import Counter
from pathlib import Path

from utils import *
from preprocess import pre_process

BASE_DIR = Path(__file__).resolve().parent
PROBS_DIRS = os.path.join(BASE_DIR, 'probs')


def get_docs():
    """Read all of the documents (problem statements) from probs/*.txt
    and return them as a list of strings."""
    documents = []
    for filename in sorted(os.listdir(PROBS_DIRS)):
        with open(os.path.join(PROBS_DIRS, filename), encoding='utf8') as f:
            documents.append(f.read())
    return documents
"""
Calculate tf-idf of all docs together, and store the details in the permanant storage
This is done because we do not have the resources to re-compute everything """
def calculate_tf_idf_docs():
documents = get_docs()
#preprocessed documents
processed_docs = []
for docs in documents:
processed_docs.append(pre_process(docs))
documents = processed_docs
    # extract the vocabulary: every distinct word in the problem statements
    words = []
    seen = set()
    for doc in documents:
        for word in doc:
            if word not in seen:
                seen.add(word)
                words.append(word)
    print(len(words))

    # document frequency: the number of documents each word appears in
    dfs = dict()
    for doc in documents:
        for word in set(doc):
            dfs[word] = dfs.get(word, 0) + 1

    # +1 keeps idf strictly positive, even for words in every document
    N = len(documents) + 1
    print(N)
    # generate one |words|-dimensional tf-idf vector per document
    documents_vector = []
    for doc in documents:
        counts = Counter(doc)  # term frequency of each word in this document
        doc_vector = [counts[word] * math.log(N / dfs[word]) for word in words]
        documents_vector.append(doc_vector)
    # bundle everything a consumer needs into one dict
    db = {
        'words': words,
        'N': N,
        'documents_vector': documents_vector,
        'dfs': dfs,
    }
    # save the data to permanent storage
    with open('serial.txt', 'wb') as pickle_out:
        pickle.dump(db, pickle_out)
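

# Illustrative sketch only (not part of the original pipeline): one way a
# consumer could load `serial.txt` back and vectorize a new query against the
# stored vocabulary, reusing the same tf-idf weighting as above. The df = 1
# fallback for words unseen at precompute time is an assumption made here.
def vectorize_query(query, db_path='serial.txt'):
    with open(db_path, 'rb') as f:
        db = pickle.load(f)
    counts = Counter(pre_process(query))
    return [counts[word] * math.log(db['N'] / db['dfs'].get(word, 1))
            for word in db['words']]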


if __name__ == '__main__':
    calculate_tf_idf_docs()