-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
92 lines (69 loc) · 2.9 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import RNN, SimpleRNN, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
class testModel():
    """Small Keras text classifier: Embedding -> SimpleRNN -> softmax Dense.

    The instance owns a Keras ``Tokenizer`` (limited to ``maxNumberWords``
    words) used to turn raw texts into padded integer sequences of length
    ``paddingLength``, and a compiled ``Sequential`` model built by
    ``buildModel``.
    """

    def __init__(self, nbCategories=None, verbose=False):
        """Create the tokenizer and build/compile the model.

        Args:
            nbCategories: number of output classes; sizes the final Dense
                layer and the one-hot label encoding. Required in practice
                even though the signature defaults to None.
            verbose: stored on the instance; not otherwise used by this class.

        Raises:
            ValueError: if ``nbCategories`` is None or smaller than 1.
        """
        # Fail early with a clear message instead of an obscure error from
        # inside Keras when the output layer is created without a size.
        if nbCategories is None or nbCategories < 1:
            raise ValueError(
                'nbCategories must be a positive integer, got {!r}'.format(
                    nbCategories))
        self.verbose = verbose
        self.nbCategories = nbCategories
        self.paddingLength = 150
        self.maxNumberWords = 1000
        self.tokenizer = text.Tokenizer(num_words=self.maxNumberWords)
        # Tracks whether the tokenizer vocabulary has already been built.
        self._tokenizerFitted = False
        self.buildModel()

    def preprocess(self, x, fit=None):
        """Convert raw texts into padded integer sequences.

        Args:
            x: iterable of text strings.
            fit: whether to (re)fit the tokenizer vocabulary on ``x``.
                ``None`` (default) fits only on the first call, so the
                vocabulary learned from the training texts is reused as-is
                for any later data (validation / test / inference).
                ``True`` forces a fit, ``False`` never fits.

        Returns:
            The output of ``pad_sequences`` with ``maxlen=paddingLength``.
        """
        if fit is None:
            fit = not self._tokenizerFitted
        if fit:
            # Builds the word index from x.
            self.tokenizer.fit_on_texts(x)
            self._tokenizerFitted = True
        # Turns word sentences to word sequences
        sequences = self.tokenizer.texts_to_sequences(x)
        return sequence.pad_sequences(sequences, maxlen=self.paddingLength)

    def preprocessLabels(self, labels):
        """One-hot encode integer class labels over ``nbCategories`` classes."""
        return to_categorical(labels, num_classes=self.nbCategories)

    def buildModel(self):
        """Build and compile the network, storing it in ``self.model``."""
        self.model = Sequential()
        # Vocabulary size is tied to the tokenizer's word limit so the two
        # cannot drift apart.
        # NOTE(review): the embedding is frozen (trainable=False) but no
        # pretrained weights are loaded in this class, so it keeps its
        # initial values - confirm this is intended.
        self.model.add(Embedding(
            self.maxNumberWords, 64,
            input_length=self.paddingLength, trainable=False))
        self.model.add(SimpleRNN(32))
        # Softmax turns the outputs into class probabilities, which is what
        # the 'categorical_crossentropy' loss expects by default.
        self.model.add(Dense(self.nbCategories, activation='softmax'))
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer='rmsprop',
            metrics=['accuracy'])

    def train(self, x, y, epochs=5):
        """Fit the model on preprocessed inputs ``x`` and one-hot labels ``y``.

        Args:
            x: padded sequences, as returned by ``preprocess``.
            y: one-hot labels, as returned by ``preprocessLabels``.
            epochs: number of training epochs.
        """
        self.model.fit(x, y, shuffle='batch', epochs=epochs)

    def evaluate(self, x, y):
        """Return the result of ``self.model.evaluate(x, y)``."""
        return self.model.evaluate(x, y)

    def predict(self, x):
        """Return the result of ``self.model.predict(x)``."""
        return self.model.predict(x)
if __name__ == '__main__':
    # Script entry point: load the challenge CSVs, train the classifier on
    # an 80/20 split and report accuracy and evaluation results on the
    # held-out part.

    # All variables
    verbose = True
    nbCategories = 52
    epochs = 5
    dataFolder = './challenge_data'
    xPath = os.path.join(dataFolder, 'input_train.csv')
    yPath = os.path.join(dataFolder, 'challenge_output_data_training_file_predict_the_expected_answer.csv')

    # Model creation
    model = testModel(nbCategories=nbCategories, verbose=verbose)

    # Loading, parsing and splitting training and testing data.
    # Only column 1 of each ';'-separated file is read, flattened to 1-D.
    x = pd.read_csv(xPath, delimiter=';', usecols=[1]).values.ravel()
    y = pd.read_csv(yPath, delimiter=';', usecols=[1]).values.ravel()
    y = model.preprocessLabels(y)
    xTrain, xTest, yTrain, yTest = train_test_split(
        x, y, test_size=0.2, random_state=42)
    # Training texts are preprocessed first, then the test texts.
    xTrain = model.preprocess(xTrain)
    xTest = model.preprocess(xTest)

    # Training model
    model.train(xTrain, yTrain, epochs=epochs)

    # %% Testing model
    # NOTE: `loss` holds whatever model.evaluate returns; for a model
    # compiled with metrics that is the loss followed by the metric values.
    loss = model.evaluate(xTest, yTest)
    prediction = model.predict(xTest)
    # Collapse per-class scores / one-hot rows back to class indices.
    predictionCategories = np.argmax(prediction, axis=1)
    yTestCategories = np.argmax(yTest, axis=1)
    # Percentage of test samples whose predicted class matches the label,
    # computed with a vectorised comparison instead of an index loop.
    nbCorrect = np.count_nonzero(predictionCategories == yTestCategories)
    accuracy = 100 * nbCorrect / len(yTestCategories)
    print('Accuracy: {:.2f} %\nLoss: {}'.format(accuracy, str(loss)))