randomForest.py
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import metrics
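
# create_dataset interleaves rows from the two class DataFrames in random order,
# labelling rows taken from gf as class 0 and rows taken from gt as class 1.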
def create_dataset(gf, gt):
    Xg, ys = [], []
    lst = gf.shape[0] + gt.shape[0]
    f_counter = 0
    t_counter = 0
    for i in range(0, lst):
        rnd = random.randint(0, 100)
        if (rnd >= 50 and f_counter < gf.shape[0]) or (rnd < 50 and t_counter >= gt.shape[0]):
            z = gf.iloc[f_counter, :]
            Xg.append(z)
            ys.append(0)
            f_counter += 1
        elif (rnd < 50 and t_counter < gt.shape[0]) or (rnd >= 50 and f_counter >= gf.shape[0]):
            z = gt.iloc[t_counter, :]
            Xg.append(z)
            ys.append(1)
            t_counter += 1
    return np.array(Xg), np.array(ys)
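
# Load the per-class CSV files (class 0 and class 1), drop the clinical/demographic
# columns, and robust-scale the two miRNA expression columns.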
case_0_df = pd.read_csv('data-new/case-0.csv')
case_1_df = pd.read_csv('data-new/case-1.csv')
# drop() is not in-place, so the result must be assigned back
drop_columns = ['C? Mean-mir 20a', "Metastatic", "Surgery", "Chemotherapy", "Radiotherapy", "sex", "Age", "Ethinity", "smoking ", "Alchoholic", "Adiction", "Cardia", "A.Rumathoid", "Diabete", "Pulonary"]
case_0_df = case_0_df.drop(drop_columns, axis=1)
case_1_df = case_1_df.drop(drop_columns, axis=1)
g_scale_column = ['C? Mean-mir let 7a', 'C? Mean-mir 221']
g_scaler = RobustScaler()
# note: the scaler is refit separately on each class's rows
g_scaler = g_scaler.fit(case_0_df[g_scale_column])
case_0_df.loc[:, g_scale_column] = g_scaler.transform(case_0_df[g_scale_column].to_numpy())
g_scaler = g_scaler.fit(case_1_df[g_scale_column])
case_1_df.loc[:, g_scale_column] = g_scaler.transform(case_1_df[g_scale_column].to_numpy())
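
# Build the interleaved dataset, then split it at a fixed index: the first
# `thresh` rows train the random forest and the remaining rows are held out for testing.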
Xg, y = create_dataset(case_0_df, case_1_df)
print(y)
thresh = 10
train_data = Xg[0:thresh, :]
train_label = y[0:thresh]
clf = RandomForestClassifier()
clf.fit(train_data, train_label)
test_data = Xg[thresh:, :]
test_label = y[thresh:]
preds = clf.predict(test_data)
# predict() already returns 0/1 class labels, so this thresholding leaves them unchanged
y_pred = np.where(preds < 0.5, 0, 1)
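
# Area under the ROC curve, computed here from the hard 0/1 predictions;
# predict_proba scores would give a finer-grained curve.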
roc_auc = roc_auc_score(test_label, preds)
print('AUC: %.2f' % roc_auc)
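
# ROC curve and two candidate operating points: optimal_idx1 maximises Youden's J
# statistic (tpr - fpr); optimal_idx2 minimises the distance to the ideal corner (fpr=0, tpr=1).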
fpr1, tpr1, thresholds1 = roc_curve(test_label, preds)
optimal_idx1 = np.argmax(tpr1 - fpr1)
optimal_idx2 = np.argmin(np.sqrt(np.power(1 - tpr1, 2) + np.power(fpr1, 2)))
# print("Roc", thresholds1[optimal_idx1])
# print("Roc", thresholds1[optimal_idx2])
plt.plot(fpr1, tpr1, label='ROC curve (area = {0:0.2f})'.format(roc_auc), color='black')
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
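
# Confusion matrix on the test set, plus specificity, sensitivity, and accuracy.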
conf_mat = metrics.confusion_matrix(test_label, y_pred)
print(conf_mat)
tn, fp, fn, tp = conf_mat.ravel()
specificity = tn / (tn + fp)
sensitivity = tp / (fn + tp)
accuracy = (tp + tn) / (tp + tn + fn + fp)
print("Specificity is", specificity)
print("Sensitivity is", sensitivity)
print("Accuracy is", accuracy)