-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
218 lines (166 loc) · 8.12 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import sklearn.preprocessing as pre
###################################################################################
############################ PREP DATA FOR MODELING ###############################
###################################################################################
def scale_data(train, validate, test, target):
    '''
    Min-max scales the numeric feature columns of train, validate and test.

    The scaler is fit on train only and then applied to all three splits.
    The columns that get scaled are the numeric columns of train, excluding
    the target.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits. They are copied, not modified.
    target : str
        Name of the target column, which is left unscaled. It does not need
        to be numeric.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_scaled, validate_scaled, test_scaled): copies of the inputs
        in which the scaled columns are overwritten with their scaled values.
    '''
    numeric_cols = train.select_dtypes(include=np.number).columns
    # Filter instead of list.remove(): remove() raises ValueError when the
    # target is not among the numeric columns (e.g. a string class label).
    scale_features = [col for col in numeric_cols if col != target]

    train_scaled = train.copy()
    validate_scaled = validate.copy()
    test_scaled = test.copy()

    # Nothing to scale: return the untouched copies rather than fitting a
    # scaler on a frame with zero columns.
    if not scale_features:
        return train_scaled, validate_scaled, test_scaled

    minmax = pre.MinMaxScaler()
    minmax.fit(train[scale_features])

    splits = (
        (train, train_scaled),
        (validate, validate_scaled),
        (test, test_scaled),
    )
    for source, scaled in splits:
        # Assign the transformed array directly (row order is preserved), so
        # the result does not depend on index alignment and still works when
        # a split has duplicate index labels.
        scaled[scale_features] = minmax.transform(source[scale_features])

    return train_scaled, validate_scaled, test_scaled
def get_dumdum(train, validate, test, cols_to_encode):
    '''
    Appends dummy (one-hot) columns for cols_to_encode to each data split.

    The dummy columns are defined by train. The dummy columns of validate
    and test are conformed to train's: a dummy column that train has but a
    split lacks is added filled with 0, and a dummy column that only appears
    in validate or test is dropped. This keeps the three returned frames on
    the same set of encoded columns, so features chosen from train exist in
    every split.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits. They are not modified.
    cols_to_encode : list
        Names of the columns to pass to pandas.get_dummies.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, validate, test), each being the original frame with the
        dummy columns concatenated on the right. The original columns are
        kept.
    '''
    dummy_train = pd.get_dummies(train[cols_to_encode], dummy_na=False)
    encoded = [pd.concat([train, dummy_train], axis=1)]

    for split in (validate, test):
        dummies = pd.get_dummies(split[cols_to_encode], dummy_na=False)
        # Encoding each split on its own yields whatever categories happen to
        # be present in that split; conform to train's dummy columns so all
        # splits share one layout.
        dummies = dummies.reindex(columns=dummy_train.columns, fill_value=0)
        encoded.append(pd.concat([split, dummies], axis=1))

    return tuple(encoded)
def pre_prep(train, validate, test, cols_to_encode, target):
    '''
    Prepares train, validate and test for modeling.

    Steps, in order:
    1. cast every column in cols_to_encode to the 'category' dtype
       (done separately on each split),
    2. scale the data with scale_data,
    3. add dummy columns for cols_to_encode with get_dumdum.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The data splits. They are not modified; new frames are returned.
    cols_to_encode : list
        Names of the categorical columns to encode.
    target : str
        Name of the target column, passed through to scale_data.

    Returns
    -------
    tuple of pandas.DataFrame
        (train, validate, test) after scaling and encoding.
    '''
    # astype returns a new frame, so the caller's dataframes are left as they
    # were (assigning columns in place would mutate them and can trigger
    # chained-assignment warnings when the splits are slices).
    category_types = {col: 'category' for col in cols_to_encode}
    train = train.astype(category_types)
    validate = validate.astype(category_types)
    test = test.astype(category_types)

    train, validate, test = scale_data(train, validate, test, target)
    train, validate, test = get_dumdum(train, validate, test, cols_to_encode)
    return train, validate, test
def prep_for_model(train, validate, test, target, drivers):
    '''
    Splits each of train, validate and test into features and target.

    For every split, X is the selection of the columns listed in drivers
    and y is the target column.

    Returns X_train, y_train, X_validate, y_validate, X_test, y_test
    '''
    pieces = []
    for frame in (train, validate, test):
        # features first, then the target, for each split in turn
        pieces.extend((frame[drivers], frame[target]))
    return tuple(pieces)
###################################################################################
################### MODEL EVALUATION ON TRAIN AND VALIDATE DATA ###################
###################################################################################
def decision_tree_results(X_train, y_train, X_validate, y_validate):
    '''
    Fits a decision tree (max_depth=10, random_state=27) on the train data
    and scores it on train and validate.

    Returns the train score, the validate score, and the validate score
    minus the train score.
    '''
    model = DecisionTreeClassifier(random_state=27, max_depth=10)
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    validate_score = model.score(X_validate, y_validate)

    # a negative gap means the model scored lower on validate than on train
    return train_score, validate_score, validate_score - train_score
def random_forest_results(X_train, y_train, X_validate, y_validate):
    '''
    Fits a random forest (max_depth=10, min_samples_leaf=12, random_state=27)
    on the train data and scores it on train and validate.

    Returns the train score, the validate score, and the validate score
    minus the train score.
    '''
    forest = RandomForestClassifier(
        random_state=27,
        max_depth=10,
        min_samples_leaf=12,
    )
    forest.fit(X_train, y_train)

    train_score = forest.score(X_train, y_train)
    validate_score = forest.score(X_validate, y_validate)

    # a negative gap means the model scored lower on validate than on train
    gap = validate_score - train_score
    return train_score, validate_score, gap
def log_results(X_train, y_train, X_validate, y_validate):
    '''
    Fits a logistic regression (random_state=27) on the train data and
    scores it on train and validate.

    Returns the train score, the validate score, and the validate score
    minus the train score.
    '''
    classifier = LogisticRegression(random_state=27)
    classifier.fit(X_train, y_train)

    train_score = classifier.score(X_train, y_train)
    validate_score = classifier.score(X_validate, y_validate)

    # a negative gap means the model scored lower on validate than on train
    gap = validate_score - train_score
    return train_score, validate_score, gap
def compare_models(X_train, y_train, X_validate, y_validate):
    '''
    Runs the decision tree, random forest and logistic regression
    evaluations on the given train and validate data and collects their
    results, one row per model and in that order, in a single df.
    '''
    evaluators = (
        ('Decision Tree', decision_tree_results),
        ('Random Forest', random_forest_results),
        ('Logistic Regression', log_results),
    )

    # start from a blank dataframe; make_metric_df adds one row per call
    summary = pd.DataFrame()
    for label, evaluate in evaluators:
        train_acc, validate_acc, gap = evaluate(X_train, y_train, X_validate, y_validate)
        summary = make_metric_df(train_acc, validate_acc, gap, summary, label)
    return summary
def make_metric_df(in_sample_accuracy, out_of_sample_accuracy, acc_diff, metric_df, model_name):
    '''
    Adds one model's accuracy results to a results dataframe.

    Parameters
    ----------
    in_sample_accuracy : float
        Accuracy of the model on train.
    out_of_sample_accuracy : float
        Accuracy of the model on validate.
    acc_diff : float
        Difference between the two accuracies.
    metric_df : pandas.DataFrame
        Results collected so far; pass an empty dataframe to start a new
        table. It is not modified.
    model_name : str
        Label stored in the 'model' column.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with columns 'model', 'Train Accuracy',
        'Validate Accuracy' and 'Difference'. It holds only the new row when
        metric_df is empty, otherwise the rows of metric_df followed by the
        new row, with the index renumbered from 0.
    '''
    new_row = pd.DataFrame(data=[
        {
            'model': model_name,
            'Train Accuracy': in_sample_accuracy,
            'Validate Accuracy': out_of_sample_accuracy,
            'Difference': acc_diff,
        }])

    if metric_df.empty:
        return new_row

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows and works on older versions as well.
    return pd.concat([metric_df, new_row], ignore_index=True)
def best_model_comparison(X_train, y_train, X_validate, y_validate, X_test, y_test):
    '''
    Fits a random forest (max_depth=10, min_samples_leaf=12, random_state=27)
    on the train data and returns a one-row df with its score on train,
    validate and test.
    '''
    forest = RandomForestClassifier(
        random_state=27,
        max_depth=10,
        min_samples_leaf=12,
    )
    forest.fit(X_train, y_train)

    splits = (
        ("Train Accuracy", X_train, y_train),
        ("Validate Accuracy", X_validate, y_validate),
        ("Test Accuracy", X_test, y_test),
    )
    # one column per split, in train / validate / test order
    scores = {label: forest.score(X, y) for label, X, y in splits}
    return pd.DataFrame(data=[scores])