#!/usr/bin/python
# coding: utf-8
# ### Importing Libraries
# In[1]:
#main_file = 'Naive_Bayes_Classifer_main_training_file_test_CLEANED.py'
import sklearn
import string
import pandas as pd, numpy as np
import PyPDF2
import os, pickle
from time import time
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# In[2]:
t0 = time()
testfiles = []
path = str(input("Enter test directory path: \n"))
try:
    # Collect every PDF under the given directory (recursively)
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".pdf"):
                testfiles.append(os.path.join(root, file))
    # print(testfiles)
    testfile_content = []
    for file in testfiles:
        content_data = ""
        with open(file, 'rb') as PDF_fileObj2:
            pdfReader = PyPDF2.PdfFileReader(PDF_fileObj2)
            # Only the first four pages are used, so stop there instead of
            # walking every page
            for i in range(0, min(4, pdfReader.numPages)):
                content_data += pdfReader.getPage(i).extractText()
        testfile_content.append(content_data)
    test_data = pd.DataFrame(testfile_content)
    test_data.columns = ['TEXT']  # a flat Index; [['TEXT']] would create a MultiIndex
    test_data.info()
except Exception as e:
    print("Invalid Path!\nPlease enter a correct path.")
# In[3]:
# Keywords file
Section_keywords = pd.read_excel("")  # Keywords filepath
Keywords_df = Section_keywords.copy().fillna(0)
Keywords_df = Keywords_df.drop(Keywords_df.columns[:3], axis=1)
keyword_list = []
for col in Keywords_df.columns:
    for val in Keywords_df[col]:
        if val != 0:
            keyword_list.append(val.lower())
keyword_list_unique = set(keyword_list)
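# Equivalent set comprehension (same behaviour, assuming every non-zero cell is a string):
# keyword_list_unique = {val.lower() for col in Keywords_df.columns
#                        for val in Keywords_df[col] if val != 0}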
# ### Cleaning & Standardising test document
# In[4]:
# Lowercase, turn newlines/tabs into spaces, and trim leading/trailing spaces
test_data['TEXT'] = (test_data['TEXT'].str.lower()
                                      .str.replace("\n", " ")
                                      .str.replace("\t", " ")
                                      .str.strip(" "))
# Keep only the words that appear in the keyword vocabulary
test_data['TEXT'] = test_data['TEXT'].apply(
    lambda text: " ".join(w for w in text.split(" ") if w in keyword_list_unique))
test_data
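# A hypothetical example of the filtering step (keywords and text are illustrative only):
#   keyword_list_unique = {"revenue", "audit"}
#   "Total revenue rose\nafter the audit" -> "revenue audit"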
# ### Load the model & vocab
# In[5]:
model_path = ""  # Model filepath
vocab = ""  # Vocabulary filepath
with open(vocab, "rb") as f:
    loaded_vectorizer = CountVectorizer(decode_error="replace", vocabulary=pickle.load(f))
with open(model_path, "rb") as f:
    model_loaded = pickle.load(f)
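# The two pickle files are presumably the counterparts of something like this in
# the training script (a sketch; the filenames and variable names are assumptions):
#
#     with open("vocabulary.pkl", "wb") as f:
#         pickle.dump(vectorizer.vocabulary_, f)
#     with open("nb_model.pkl", "wb") as f:
#         pickle.dump(model, f)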
# ### Prediction
# In[6]:
predicted_labels = []
for i in range(len(test_data)):
    # transform() expects an iterable of documents, so wrap the single text in a list
    test_cv = loaded_vectorizer.transform([test_data.iloc[i]['TEXT']])
    predicted = model_loaded.predict(test_cv.toarray())
    predicted_labels.append(predicted)
    print("Filename: {}\t, Index: {}\t, Predicted Label: {}".format(testfiles[i], i, predicted))
# print("Predicted Label: ", model_loaded.predict(test_cv.toarray()))
# ### Taking Feedback as Input
# In[7]:
## Feedback section
get_labels = str(input("Enter Index and Correct Label separated by comma:\n"))
get_labels_list = get_labels.split(",")
index = int(get_labels_list[0])
label = [get_labels_list[1]]
# In[8]:
# Overwrite the prediction at the given index with the user-supplied label
predicted_labels_copy = list(predicted_labels)  # a real copy, not an alias of predicted_labels
predicted_labels_copy[index] = label
print([str(val) for val in predicted_labels_copy])
# ### Taking the data for the label
# In[9]:
feedback_data = test_data.iloc[[index]].copy()  # .copy() avoids a SettingWithCopyWarning below
feedback_data.index = [0]
# print(feedback_data)
# ### Re-Training data
# In[53]:
Retraining_data_path = ""  # Retraining data directory path
retrain_df = pd.concat([feedback_data, pd.DataFrame([label], columns=["LABEL"])], axis=1)
retrain_df.columns = ['TEXT', 'LABEL']  # a flat Index, not [['TEXT','LABEL']]
print(retrain_df)
# Timestamped filename so successive feedback rounds don't overwrite each other
retrain_df.to_excel(os.path.join(Retraining_data_path, "Retrain_data_" + str(time()).split(".")[0] + ".xlsx"))
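# e.g. writes "Retrain_data_1718000000.xlsx" (epoch-seconds suffix; the value shown is illustrative)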
# ### Re-Training the Model
# In[63]:
import glob
# Collect all retraining data written by the feedback rounds above
retrain_data = [pd.read_excel(file, index_col=[0]) for file in glob.glob("")]  # Retraining data glob pattern
full_retrain_data = pd.concat(retrain_data, ignore_index=True)
full_retrain_data.dropna(inplace=True)
# In[ ]:
text_data_df = pd.read_excel("", index_col=[0])  # Training data filepath
# In[67]:
full_data = pd.concat([text_data_df, full_retrain_data])
full_data.to_excel("")  # Main Training + Feedback data filepath
print(full_data)
main_script = ""  # Main code filepath
os.system("python " + main_script)  # Re-run the main training script on the combined data
print("Time Taken: %0.3fs" % (time() - t0))  # Timer