-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxgb_baseline.py
227 lines (182 loc) · 7.83 KB
/
xgb_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# XGBoost baseline: hyper-parameter grid search with 5-fold cross-validation.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
import time
import pandas as pd
# Load the Stata dataset.
# NOTE(review): `df` is not referenced again at module level in the visible
# file -- confirm this load is still needed.
df = pd.read_stata("ecart_data/nlp_clindiag_reason_data.dta")
import configparser
# The config file supplies the input and label column names (section DATAPOINTS).
cfgfile = "ecart_data/config.cfg"
cfg = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open instead of raising, so a wrong
# path only surfaces at the cfg.get() calls below.
cfg.read(cfgfile)
# NOTE(review): the variable is named `train_csv` but the file read is val.csv.
train_csv = pd.read_csv("FINAL/val.csv",sep="|")
print(len(train_csv))
# Each option holds its column names as one ", "-separated string.
cfg_inputs = cfg.get('DATAPOINTS','inputs').split(", ")
cfg_labels = cfg.get('DATAPOINTS','labels').split(", ")
# Development set: pipe-delimited all.csv under folder_path. It is passed to
# main_kfold_xgboost() further down in this file.
folder_path = "FINAL_remove_false_alarm/"
dev_df = pd.read_csv(folder_path+"all.csv", sep="|")
def read_data_from_csv(fn):
    """Read a pipe-delimited CSV and split every row into features and a target.

    The column names come from the ``DATAPOINTS`` section of the module-level
    ``cfg`` (options ``inputs`` and ``labels``, each a ", "-separated string).
    The row count, a separator line and the feature Series of every row are
    printed as the file is processed.

    Args:
        fn: Path of the CSV file to load (``sep="|"``).

    Returns:
        A tuple ``(X, Y)``: ``X`` is a list with one pandas Series per row
        holding the configured input columns; ``Y`` is a list with, per row,
        the value at position 22 of the configured label columns.
    """
    frame = pd.read_csv(fn, sep="|")
    print(len(frame))

    feature_cols = cfg.get('DATAPOINTS', 'inputs').split(", ")
    label_cols = cfg.get('DATAPOINTS', 'labels').split(", ")

    features = []
    targets = []
    for _, record in frame.iterrows():
        print(" =========== ")
        row_features = record[feature_cols]
        row_labels = record[label_cols]
        print(row_features)
        features.append(row_features)
        # Only the label at position 22 of the configured label list is kept.
        targets.append(row_labels.tolist()[22])
    return features, targets
def extract_data_from_pd(df):
    """Split every row of a DataFrame into a feature Series and a target value.

    Column names are taken from the ``DATAPOINTS`` section of the module-level
    ``cfg`` (options ``inputs`` and ``labels``, each a ", "-separated string).

    Args:
        df: DataFrame containing the configured input and label columns.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a list with one pandas Series per row
        holding the configured input columns; ``Y`` is a list with, per row,
        the value at position 22 of the configured label columns.
    """
    feature_cols = cfg.get('DATAPOINTS', 'inputs').split(", ")
    label_cols = cfg.get('DATAPOINTS', 'labels').split(", ")

    features = []
    targets = []
    for _, record in df.iterrows():
        row_features = record[feature_cols]
        row_labels = record[label_cols]
        features.append(row_features)
        # Only the label at position 22 of the configured label list is kept.
        targets.append(row_labels.tolist()[22])
    return features, targets
# Build the per-row feature/label lists from all.csv.
# NOTE(review): this re-reads the same file already loaded into `dev_df`, and
# read_data_from_csv() prints every row while doing so.
train_X, train_Y = read_data_from_csv(folder_path+"all.csv")
# Feature columns used for the model. Categorical variables are left out for
# now, until all of them are available.
# NOTE(review): 'chloride' is the only lab column without the '_impute'
# suffix -- confirm this is the intended column name.
selected_columns = ['age','sbp_impute', 'dbp_impute',
'ppi_impute', 'o2sat_impute', 'temp_c_impute', 'avpu_impute',
'albumin_impute', 'alk_phos_impute', 'anion_gap_impute',
'bili_total_impute', 'bun_impute', 'bun_cr_ratio_impute',
'calcium_impute', 'chloride', 'co2_impute', 'creatinine_impute',
'gluc_ser_impute', 'hb_impute', 'platelet_count_impute',
'potassium_impute', 'sgot_impute', 'sodium_impute',
'total_protein_impute', 'wbc_impute']
def grid_search_5_fold(input_X, input_Y,cv_num=5):
    """Grid-search XGBoost hyper-parameters with a default (non-GPU) classifier.

    The grid spans 4 x 5 x 4 x 3 = 240 combinations of ``n_estimators``,
    ``max_depth``, ``learning_rate`` and ``min_child_weight``. No ``scoring``
    argument is passed to ``GridSearchCV``. The best parameters, the best
    score and the wall-clock time in minutes are printed.

    Args:
        input_X: Feature matrix passed to ``GridSearchCV.fit``.
        input_Y: Target values passed to ``GridSearchCV.fit``.
        cv_num: Number of cross-validation folds (default 5).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # The timer starts before any setup so the reported time covers everything.
    started = time.time()

    search_space = {
        'n_estimators': [50, 100, 250, 500],
        'max_depth': [2, 5, 10, 15, 20],
        'learning_rate': [0.005, 0.01, 0.05, 0.1],
        'min_child_weight': [1, 2, 3],
    }

    searcher = GridSearchCV(
        estimator=xgb.XGBClassifier(),
        param_grid=search_space,
        cv=cv_num,
        verbose=1,
    )
    searcher.fit(input_X, input_Y)

    time_taken = (time.time() - started) / 60

    print("Best parameters:", searcher.best_params_)
    print("Best score:", searcher.best_score_)
    print(f"Running time: {time_taken:.4f} mins")
    return searcher
def grid_search_5_fold_GPU(input_X, input_Y, cv_num=5):
    """Grid-search XGBoost hyper-parameters with a GPU-backed classifier.

    Same 240-combination grid as ``grid_search_5_fold``, but the estimator is
    built with ``tree_method='gpu_hist'``, ``predictor='gpu_predictor'`` and
    ``gpu_id=2``. The best parameters, the best score and the wall-clock time
    in minutes are printed.

    Args:
        input_X: Feature matrix passed to ``GridSearchCV.fit``.
        input_Y: Target values passed to ``GridSearchCV.fit``.
        cv_num: Number of cross-validation folds (default 5).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    start_time = time.time()

    param_grid = {
        'n_estimators': [50, 100, 250, 500],
        'max_depth': [2, 5, 10, 15, 20],
        'learning_rate': [0.005, 0.01, 0.05, 0.1],
        'min_child_weight': [1, 2, 3],
    }

    # NOTE(review): the GPU index is hard-coded to 2, and newer XGBoost
    # releases replace tree_method='gpu_hist' / predictor / gpu_id with a
    # single `device` argument -- confirm against the installed version.
    xgb_clf = xgb.XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor', gpu_id=2)

    grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=cv_num, verbose=1)

    # Fit on the data passed in. The previous code fitted on `selected_X` and
    # `new_Y`, which are neither parameters nor module-level names, so the
    # call either raised NameError or trained on unrelated leftover globals.
    grid_search.fit(input_X, input_Y)

    end_time = time.time()
    time_taken = (end_time - start_time) / 60

    print("Best parameters:", grid_search.best_params_)
    print("Best score:", grid_search.best_score_)
    print(f"Running time With GPU: {time_taken:.4f} mins")
    return grid_search
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
def main_kfold_xgboost(data, k_num=5):
    """Evaluate a grid-searched XGBoost classifier with K-fold cross-validation.

    For each outer fold the function splits ``data`` into train and test rows,
    runs ``grid_search_5_fold_GPU`` on the training rows, fits a GPU
    ``XGBClassifier`` with the best parameters found, and scores the held-out
    rows with AUROC.

    Args:
        data: ``pd.DataFrame`` with one row per sample. It must contain an
            ``encounter_id`` column plus the input and label columns that
            ``extract_data_from_pd`` reads from the config; the configured
            inputs must include every name in ``selected_columns``.
        k_num: Number of outer folds (default 5).

    Returns:
        A tuple ``(test_enc_ids, batch_auroc, batch_yproba)`` of dicts keyed
        by the 0-based fold index:
        ``test_enc_ids`` maps to the list of held-out ``encounter_id`` values,
        ``batch_auroc`` to the fold's AUROC, and ``batch_yproba`` to the
        predicted positive-class probabilities for the held-out rows.
    """
    # Step 1: set up the outer split. `k_num` is honoured here; it used to be
    # ignored in favour of a hard-coded 5.
    # NOTE(review): the split is over rows. If one encounter_id can span
    # several rows, a grouped split would be needed to avoid leakage -- confirm.
    kf = KFold(n_splits=k_num)
    all_enc_ids = data['encounter_id']
    print(f"TOTAL ENCOUNTER NUM: {len(all_enc_ids)}")

    selected_columns = [
        'age', 'sbp_impute', 'dbp_impute',
        'ppi_impute', 'o2sat_impute', 'temp_c_impute', 'avpu_impute',
        'albumin_impute', 'alk_phos_impute', 'anion_gap_impute',
        'bili_total_impute', 'bun_impute', 'bun_cr_ratio_impute',
        'calcium_impute', 'chloride', 'co2_impute', 'creatinine_impute',
        'gluc_ser_impute', 'hb_impute', 'platelet_count_impute',
        'potassium_impute', 'sgot_impute', 'sodium_impute',
        'total_protein_impute', 'wbc_impute',
    ]

    test_enc_ids = {}
    batch_auroc = {}
    batch_yproba = {}

    # Step 2: train and evaluate once per fold.
    for fold, (train_index, test_index) in enumerate(kf.split(all_enc_ids)):
        # a. Slice the *argument* (not the module-level dev_df) so the indices
        #    produced from `data` are applied to the same frame.
        print(f"===== STARTING AT THE {fold}TH BATCH OF K-FOLD ======")
        train_df = data.iloc[train_index]
        print(f"training set size: {train_df.shape}")
        test_df = data.iloc[test_index]
        print(f"test set size: {test_df.shape}")
        test_enc_ids[fold] = test_df["encounter_id"].tolist()

        test_X, test_Y = extract_data_from_pd(test_df)
        train_X, train_Y = extract_data_from_pd(train_df)

        # b. Inner grid search on the training rows only.
        # NOTE(review): `replace_nan_in_y` is not defined in the visible part
        # of this file -- confirm where it comes from.
        selected_X = pd.DataFrame(train_X)[selected_columns]
        new_Y = replace_nan_in_y(train_Y)
        grid_search = grid_search_5_fold_GPU(selected_X, new_Y, cv_num=5)

        # c. Fit a classifier with the best hyper-parameters. Copy the dict
        #    so the GridSearchCV result object is not mutated.
        best_params = dict(grid_search.best_params_)
        best_params["tree_method"] = "gpu_hist"
        # NOTE(review): this uses gpu_id=1 while the grid search uses
        # gpu_id=2 -- confirm the device assignment is intentional.
        best_xgb_clf_gpu = xgb.XGBClassifier(**best_params, predictor="gpu_predictor", gpu_id=1)

        start_time = time.time()
        best_xgb_clf_gpu.fit(selected_X, new_Y)
        time_taken_training = (time.time() - start_time) / 60
        print(f"Time Taken Training: {time_taken_training:.4f} mins")

        # d. Score the held-out rows.
        test_selected_X = pd.DataFrame(test_X)[selected_columns]
        test_Y = replace_nan_in_y(test_Y)
        print(f"XGB Test Input Shape {test_selected_X.shape}")
        # Column 1 of predict_proba is the positive-class probability.
        y_pred_proba = best_xgb_clf_gpu.predict_proba(test_selected_X)[:, 1]
        auroc = roc_auc_score(test_Y, y_pred_proba)

        batch_yproba[fold] = y_pred_proba
        batch_auroc[fold] = auroc
        print(f"Batch {fold} AUROC: {auroc:.4f}")

    return test_enc_ids, batch_auroc, batch_yproba
# Run the k-fold evaluation on the development set (default k_num).
test_enc_ids, batch_auroc, batch_yproba = main_kfold_xgboost(dev_df)
# Draw the ROC curve.
import matplotlib.pyplot as plt
from sklearn import metrics
# NOTE(review): `all_y_golds` and `all_y_probs` are not defined anywhere in the
# visible file -- presumably the gold labels and predicted probabilities pooled
# over all folds; confirm where they are built before running this section.
fpr, tpr, _ = metrics.roc_curve(all_y_golds, all_y_probs)
auc = metrics.roc_auc_score(all_y_golds, all_y_probs)
plt.plot(fpr, tpr, label="UW, auc="+str(auc))
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.show()
# Calibration curve, plotted against the perfectly calibrated diagonal.
from sklearn.calibration import calibration_curve
# NOTE(review): `all_y_golds` and `all_y_probs` are not defined anywhere in the
# visible file -- confirm where they are built before running this section.
# Predictions are grouped into 10 bins.
true_prob, pred_prob = calibration_curve(all_y_golds, all_y_probs, n_bins=10)
plt.plot(pred_prob, true_prob, marker='o', linewidth=1, label="Model")
# Reference line y = x.
plt.plot([0,1],[0,1], linestyle='--', label="Perfectly calibrated")
plt.xlabel('Pred Prob')
plt.ylabel('True Prob in each bin')
plt.title('Calibration curve')
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.show()