# training_xgboost_model.py
#
# This script trains an XGBoost model with cross-validation for each unique room in the dataset.
# Tuned parameters and trained models are dumped into the run folder's models/ and trntst_models/
# directories, and the results are dumped into two CSV files (error.csv and prediction.csv) in the
# same run folder.
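#
# Example invocations (illustrative only; the flags and their defaults come from the
# argparse setup below):
#   python training_xgboost_model.py --metric R2 --SMOTE 1 --SMOGN 0
#   python training_xgboost_model.py --metric RMSE --log 100 --SampleWeight 1
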
import argparse
import json
import math
import os
import pickle
import warnings
from typing import Tuple
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from imblearn.over_sampling import SMOTE
from numpy.random import RandomState
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_sample_weight
from tqdm import tqdm
from xgboost import DMatrix, cv
# Set up an argument parser for the training options
parser = argparse.ArgumentParser()
parser.add_argument("--metric", choices=['R2', 'RMSE'], type=str, required=False, default='R2',
                    help="The evaluation metric used to train the XGBoost model")
parser.add_argument("--log", choices=[0, 1, 100], type=int, required=False, default=0,
                    help="How often to print the training progress (passed to verbose_eval)")
parser.add_argument("--SMOTE", choices=[0, 1], type=int, required=False, default=1,
                    help="Whether to use SMOTE oversampling")
parser.add_argument("--SMOGN", choices=[0, 1], type=int, required=False, default=0,
                    help="Whether to use SMOGN oversampling")
parser.add_argument("--SampleWeight", choices=[0, 1], type=int, required=False, default=0,
                    help="Whether to use balanced sample weights")
args = parser.parse_args()
# Ignore all warnings and set pandas to display every column and row every time we print a dataframe
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# SMOTE and SMOGN are mutually exclusive; either one (or neither) may be enabled
assert not (args.SMOTE and args.SMOGN), "Can't use SMOTE and SMOGN at the same time!"
# Load the data with a positive AC electricity consumption value, and drop the time data as we don't need them
data = pd.read_csv("summer_data_compiled.csv", index_col=0)
data = data[data.AC > 0].drop(['Time', 'Date', 'Hour'], axis=1).reset_index(drop=True)
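# (Assumption based on the columns used below: the compiled CSV provides at least a
# 'Location' room identifier, the 'AC' consumption target, and the dropped
# 'Time'/'Date'/'Hour' columns.)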
# Create directories to store the models and future analysis figures.
# log_folder_name = "Test_{}_{}".format(args.metric, datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
log_folder_name = "Test_R2_HYPEROPT"
log_folder_name = log_folder_name + "_SMOTE" if args.SMOTE else log_folder_name
log_folder_name = log_folder_name + "_SMOGN" if args.SMOGN else log_folder_name
log_folder_name = log_folder_name + "_SW" if args.SampleWeight else log_folder_name
previous_parameter_folder = "Test_R2_HYPEROPT"
assert log_folder_name != previous_parameter_folder, "Previous folder name exists"
if not os.path.exists('./{}/'.format(log_folder_name)):
    os.mkdir('./{}'.format(log_folder_name))
    os.mkdir('./{}/models/'.format(log_folder_name))
    os.mkdir('./{}/trntst_models/'.format(log_folder_name))
# Define our evaluation functions
def RMSE(predt: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    truth_value = dtrain.get_label()
    root_squared_error = math.sqrt(mean_squared_error(truth_value, predt))
    return "RMSE", root_squared_error


def R2(predt: np.ndarray, dtrain: DMatrix) -> Tuple[str, float]:
    truth_value = dtrain.get_label()
    r2_value = r2_score(truth_value, predt)
    return "R2", r2_value
def fobjective(space):
    # Note: relies on the global data_matrix and seed set inside the room loop below.
    param_dict_tuning = {'max_depth': int(space['max_depth']),
                         'learning_rate': space['learning_rate'],
                         'colsample_bytree': space['colsample_bytree'],
                         'min_child_weight': int(space['min_child_weight']),
                         'reg_alpha': int(space['reg_alpha']),
                         'reg_lambda': space['reg_lambda'],
                         'subsample': space['subsample'],
                         'min_split_loss': space['min_split_loss'],
                         'objective': 'reg:squarederror'}
    xgb_cv_result = xgb.cv(dtrain=data_matrix, params=param_dict_tuning, nfold=5,
                           early_stopping_rounds=30, as_pandas=True, num_boost_round=200,
                           seed=seed, metrics='rmse', maximize=False, shuffle=True)
    return {"loss": (xgb_cv_result["test-rmse-mean"]).tail(1).iloc[0], "status": STATUS_OK}
eval_dict = {'RMSE': RMSE, 'R2': R2}

print("Start Training The Models")

# Create two dataframes to store the results during and after the training.
error_csv = pd.DataFrame(
    columns=['room', 'train-{}-mean'.format(args.metric), 'train-{}-std'.format(args.metric), 'train-rmse-mean',
             'train-rmse-std', 'test-{}-mean'.format(args.metric), 'test-{}-std'.format(args.metric), 'test-rmse-mean',
             'test-rmse-std'])
prediction_csv = pd.DataFrame(columns=['room', 'observation', 'prediction'])
room_list = data['Location'].unique()

# Range through all the rooms and do the training and cross-validation for each room.
for room in tqdm(room_list):
    seed = 2030 + room
    # Five rooms have low-quality data, so we skip them manually
    if room in (309, 312, 826, 917, 1001):
        continue
    # We extract the data of the particular room and run the configured oversampling on it.
    room_data = data[data.Location == room].drop(['Location'], axis=1).reset_index(drop=True)
    if args.SMOTE:
        # Label the AC data with a 0.75 threshold: AC above 0.75 is marked as 1, otherwise 0. Split into X and y.
        room_data['SMOTE_split'] = (room_data['AC'] > 0.75).astype('int')
        X = room_data.drop(['SMOTE_split'], axis=1)
        y = room_data['SMOTE_split']
        # Run the SMOTE algorithm and retrieve the result.
        model_smote = SMOTE(random_state=621, k_neighbors=3)
        room_data_smote, smote_split = model_smote.fit_resample(X, y)
        # Concat the result from SMOTE and split it into X and y for training.
        room_data_smote = pd.concat([room_data_smote, smote_split], axis=1)
        y = room_data_smote['AC']
        X = room_data_smote.drop(['AC', 'SMOTE_split'], axis=1)
    elif args.SMOGN:
        # Small rooms fall back to SMOTE; larger rooms use the pre-computed SMOGN output.
        if len(room_data) < 500:
            room_data['SMOTE_split'] = (room_data['AC'] > 0.75).astype('int')
            X = room_data.drop(['SMOTE_split'], axis=1)
            y = room_data['SMOTE_split']
            # Run the SMOTE algorithm and retrieve the result.
            model_smote = SMOTE(random_state=621, k_neighbors=3)
            room_data_smote, smote_split = model_smote.fit_resample(X, y)
            # Concat the result from SMOTE and split it into X and y for training.
            room_data_smote = pd.concat([room_data_smote, smote_split], axis=1)
            y = room_data_smote['AC']
            X = room_data_smote.drop(['AC', 'SMOTE_split'], axis=1)
        else:
            room_data = pd.read_csv('./SMOGN_processed/{}.csv'.format(room), index_col=0)
            y = room_data['AC']
            X = room_data.drop(['AC'], axis=1)
    else:
        # No oversampling: forward-fill missing values (y is kept one-dimensional
        # so that pd.cut below can bin it when --SampleWeight is on).
        y = room_data['AC'].fillna(method='pad')
        X = room_data.drop(['AC'], axis=1).fillna(method='pad')
    if args.SampleWeight:
        # Bin the continuous AC target into 15 intervals so compute_sample_weight
        # can balance them as if they were classes.
        class_sample = pd.cut(y, bins=15)
        weight = compute_sample_weight(class_weight="balanced", y=class_sample)
        X = X.to_numpy()
    # Build another full data matrix for the built-in cross-validation function to work on.
    data_matrix = DMatrix(data=X, label=y, weight=weight) if args.SampleWeight else DMatrix(data=X, label=y)
    # Cross-validation with hyper-parameter tuning
    space = {'max_depth': hp.quniform("max_depth", 3, 10, 1),
             'learning_rate': hp.uniform("learning_rate", 0.1, 3),
             'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
             'min_child_weight': hp.quniform("min_child_weight", 1, 20, 1),
             'reg_alpha': hp.quniform("reg_alpha", 0, 100, 1),
             'reg_lambda': hp.uniform("reg_lambda", 0, 2),
             'subsample': hp.uniform("subsample", 0.5, 1),
             'min_split_loss': hp.uniform("min_split_loss", 0, 9)}
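    # Note: hp.quniform samples float values, which is why max_depth and
    # min_child_weight are cast back to int in fobjective and best_param_dict.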
    # Reuse parameters tuned in a previous run when they exist; otherwise tune from scratch.
    if os.path.exists('./{}/models/{}_parameter.npy'.format(previous_parameter_folder, room)):
        best_param_dict = np.load('./{}/models/{}_parameter.npy'.format(previous_parameter_folder, room),
                                  allow_pickle=True).item()
        np.save('./{}/models/{}_parameter.npy'.format(log_folder_name, room), best_param_dict)
    else:
        trials = Trials()
        best_hyperparams = fmin(fn=fobjective, space=space, algo=tpe.suggest, max_evals=400, trials=trials,
                                rstate=RandomState(seed))
        # Set up our training parameters from the best hyper-parameters found above.
        best_param_dict = {'objective': 'reg:squarederror', 'max_depth': int(best_hyperparams['max_depth']),
                           'reg_alpha': best_hyperparams['reg_alpha'], 'reg_lambda': best_hyperparams['reg_lambda'],
                           'min_child_weight': best_hyperparams['min_child_weight'],
                           'colsample_bytree': best_hyperparams['colsample_bytree'],
                           'learning_rate': best_hyperparams['learning_rate'],
                           'subsample': best_hyperparams['subsample'],
                           'min_split_loss': best_hyperparams['min_split_loss']}
        np.save('./{}/models/{}_parameter.npy'.format(log_folder_name, room), best_param_dict)
    # Use the built-in cv function to do the cross-validation, still with five folds; this returns the results.
    xgb_cv_result = cv(dtrain=data_matrix, params=best_param_dict, nfold=5,
                       early_stopping_rounds=30, as_pandas=True, num_boost_round=200,
                       seed=seed, shuffle=True, feval=eval_dict[args.metric],
                       maximize=(args.metric == 'R2'))  # maximize R2, minimize RMSE
    xgb_cv_result['room'] = room
    # Keep the scores of the final boosting round for this room.
    error_csv.loc[len(error_csv)] = xgb_cv_result.loc[len(xgb_cv_result) - 1]
    # Use one training/testing split for plotting, and save both the ground truth and prediction values into the dataframe.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    d_train = DMatrix(X_train, label=y_train)
    d_test = DMatrix(X_test, label=y_test)
    watchlist = [(d_test, 'eval'), (d_train, 'train')]
    xgb_model_train_test = xgb.train(params=best_param_dict, dtrain=d_train, num_boost_round=200, evals=watchlist,
                                     verbose_eval=args.log, xgb_model=None, feval=eval_dict[args.metric],
                                     maximize=(args.metric == 'R2'))
    prediction = np.array(xgb_model_train_test.predict(d_test)).tolist()
    real = np.array(y_test).tolist()
    prediction_csv.loc[len(prediction_csv)] = {'room': room, 'observation': json.dumps(real),
                                               'prediction': json.dumps(prediction)}
    # Dump the result dataframes into csv files (rewritten every room, so progress is checkpointed).
    error_csv.to_csv('./{}/error.csv'.format(log_folder_name), index=False)
    prediction_csv.to_csv('./{}/prediction.csv'.format(log_folder_name), index=False)
    # Develop a model using the whole original dataset, and save the model.
    xgb_model_full = xgb.train(params=best_param_dict, dtrain=data_matrix, num_boost_round=200, evals=watchlist,
                               verbose_eval=args.log, xgb_model=None, feval=eval_dict[args.metric],
                               maximize=(args.metric == 'R2'))
    # Save all the models we trained for future use.
    pickle.dump(xgb_model_train_test, open('./{}/trntst_models/{}.pickle.bat'.format(log_folder_name, room), 'wb'))
    pickle.dump(xgb_model_full, open('./{}/models/{}.pickle.bat'.format(log_folder_name, room), 'wb'))

print("Training finished!")