-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
92 lines (72 loc) · 3.29 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import re
import nltk
import pandas as pd
import nltk as nlp
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, f1_score, recall_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
# Fetch the NLTK resources used below; nltk.download() skips packages that are
# already up to date in the local NLTK data directory.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer models from 'punkt_tab'
# instead of the pickled 'punkt' package, so nltk.word_tokenize() raises
# LookupError without it. On older releases that do not know this package the
# call only reports an error and returns False.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Turkish stop words as a set, for O(1) membership tests during preprocessing.
stopWords = set(stopwords.words('turkish'))
# str.lower() is locale-independent: it maps "I" to "i" and "İ" to "i" followed
# by U+0307 (combining dot above). Neither is the Turkish lowercase form, and
# the combining mark would be replaced by a space below, splitting the word.
# Map both letters explicitly before lowering.
_TURKISH_LOWER_MAP = str.maketrans({"I": "ı", "İ": "i"})

# Anything that is not a lowercase letter of the Turkish alphabet.
_NON_TURKISH_LETTER = re.compile("[^abcçdefgğhıijklmnoöprsştuüvyz]")

# Built once instead of on every call.
# NOTE(review): WordNet is an English resource, so this step presumably leaves
# most Turkish tokens unchanged - confirm whether it is still wanted.
_LEMMATIZER = nlp.WordNetLemmatizer()


def pre_processing(text):
    """Normalise a raw comment into a space-separated string of tokens.

    Steps: Turkish-aware lowercasing, replacement of every character outside
    the lowercase Turkish alphabet with a space, tokenisation with
    ``nltk.word_tokenize``, removal of tokens found in ``stopWords`` and
    lemmatisation with ``WordNetLemmatizer``.

    Args:
        text: The raw comment. Non-string values (for example the ``NaN`` that
            pandas yields for an empty cell) are treated as an empty comment.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing is
        left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None and have no .lower().
        return ""

    lowered = text.translate(_TURKISH_LOWER_MAP).lower()
    letters_only = _NON_TURKISH_LETTER.sub(" ", lowered)
    tokens = nltk.word_tokenize(letters_only)
    # stopWords is already a set; test membership directly rather than
    # rebuilding a set for every token.
    kept = [_LEMMATIZER.lemmatize(token) for token in tokens if token not in stopWords]
    return " ".join(kept)
# Load the labelled train / test splits from the working directory.
# NOTE(review): 'unicode_escape' interprets backslash escapes and, as far as
# the codec documentation goes, treats the remaining bytes as Latin-1; if the
# CSV files are actually UTF-8 or cp1254 the Turkish letters would be garbled
# before preprocessing - confirm the files' real encoding.
df_train = pd.read_csv('train.csv', encoding='unicode_escape')
df_test = pd.read_csv('test.csv', encoding='unicode_escape')

# Add a cleaned copy of the "comment" column to each frame.
df_train["clean_text"] = df_train["comment"].apply(pre_processing)
df_test["clean_text"] = df_test["comment"].apply(pre_processing)

# Features are the cleaned texts, targets are the "Label" column.
X_train, X_test = df_train["clean_text"], df_test["clean_text"]
y_train, y_test = df_train["Label"], df_test["Label"]

# Disabled shape diagnostics:
#print("x_train", X_train.shape)
#print("x_test", X_test.shape)
#print("y_train", y_train.shape)
#print("y_test", y_test.shape)

# TF-IDF features feeding a logistic-regression classifier, fitted on the
# training split.
LogisticRegressionModel = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ]
)
LogisticRegressionModel.fit(X_train, y_train)
def plot_confusion_matrix(Y_test, Y_preds):
    """Draw the confusion matrix of ``Y_preds`` against ``Y_test``.

    A new 6x6 inch figure is created and the matrix is rendered on it with the
    Blues colour map, a colour bar and the count written in every cell. The
    figure is not shown or saved here; that is left to the caller.

    Args:
        Y_test: True labels.
        Y_preds: Predicted labels, aligned with ``Y_test``.

    Returns:
        None.
    """
    conf_mat = confusion_matrix(Y_test, Y_preds)
    # Size ticks and annotations from the matrix itself instead of assuming
    # exactly two classes.
    n_classes = conf_mat.shape[0]

    fig = plt.figure(figsize=(6, 6))
    # Draw on the figure created above. A hard-coded fignum=1 would target
    # figure 1 even when this call created a different figure (for example on
    # a second call or when other figures are already open).
    plt.matshow(conf_mat, cmap=plt.cm.Blues, fignum=fig.number)
    plt.yticks(range(n_classes), range(n_classes))
    plt.xticks(range(n_classes), range(n_classes))
    plt.colorbar()

    # x is the column (predicted) index, y is the row (true) index; the small
    # offsets roughly centre the text inside each cell.
    for col in range(n_classes):
        for row in range(n_classes):
            plt.text(col - 0.2, row + 0.1, str(conf_mat[row, col]), color='tab:red')
# --- Evaluation ---------------------------------------------------------------
# 10-fold cross-validation of the whole pipeline on the training split.
cv_scores = cross_val_score(LogisticRegressionModel, X_train, y_train, cv=10)
#print("CV average score: %.2f" % cv_scores.mean())

# Predict the test split once and reuse the result for every metric and for
# the plot; the model is already fitted, so repeated predict() calls would
# only recompute the same array. Both names are kept because each was a
# module-level variable before.
result = LogisticRegressionModel.predict(X_test)
y_pred = result

cr = classification_report(y_test, result)
#print(cr)
#print('Train Accuracy : %.3f' % LogisticRegressionModel.score(X_train, y_train))
#print('Test Accuracy : %.3f' % LogisticRegressionModel.score(X_test, y_test))
#print(precision_score(y_test, y_pred, average='macro'), ": is the precision score")
#print(recall_score(y_test, y_pred, average='macro'), ": is the recall score")
#print(f1_score(y_test, y_pred, average='macro'), ": is the f1 score")
plot_confusion_matrix(y_test, y_pred)

# --- Interactive prediction ---------------------------------------------------
# Read one text from the user and classify it with the fitted pipeline.
input_text = input("Lütfen analiz etmek istediğiniz metni girin: ")
clean_input_text = pre_processing(input_text)
prediction = LogisticRegressionModel.predict([clean_input_text])
proportion = LogisticRegressionModel.predict_proba([clean_input_text])

# predict_proba() columns follow the order of classes_, so look up the column
# of the predicted label instead of assuming a fixed position for it.
predicted_label = prediction[0]
class_index = list(LogisticRegressionModel.classes_).index(predicted_label)
predicted_probability = proportion[0][class_index]

# Label 1 is reported as positive sentiment, anything else as negative.
if predicted_label == 1:
    print("Girdi metni pozitif duygulu.")
else:
    print("Girdi metni negatif duygulu.")
print("Tahmin olasılığı:", predicted_probability)