import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LassoLarsIC
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
"""
File contains:
1.Three different types of Linear Models
-Lass0 (L1 Penalty), Ridge(L2 Penanlty), and ElasticNet(L2, L1 Combo)
2. BIC/AIC criterion graphing using LassoLarsIC()
3. Graphing weight values from a LassoRegression Model
4. Function to get a new Pandas DF from a Lasso weight Threshold
Regression Functions:
performRidgeReg(X, y, cvfolds=5, impStrategy= 'mean', aLow=0, aHigh=1, numAlphas=30)
- Determines which alpha hyperparamter makes the best RidgeRegression and prints R^2 score
-Uses Hold out validation
-alphas in range alow to aHigh
-can change impuation strategy from mean
-Standardizes Data
performLassoReg(X, y, cvfolds=5, impStrategy= 'mean', aLow=0, aHigh=1, numAlphas=30)
- Determines which alpha hyperparamter makes the best LassoRegression and prints R^2 score
-Uses Hold out validation
-alphas in range alow to aHigh
-can change impuation strategy from mean
-Standardizes Data
performElasticReg(X, y, cvfolds=5, impStrategy= 'mean', numRatios=10, aLow=0, aHigh=1, numAlphas=10)
-Perform ElasticRegression using a combo of L1 and L2 regualarization
-Uses Hold out validation
-alphas in range alow to aHigh
-Number of different ratios to produce from 0-1 in numRatios
-can change impuation strategy from mean
-Standardizes Data
Model Improve/ Visualiztion Functions:
#taken, renamed, and briefly edited from sci-kit learn Documentation
testFitvsNumParms(X, y, impStrategy= 'mean')
-plots a graph with AIC and BIC showing optimal number of paramters with solid line
-can change impuation strategy from mean
showLassoParamWeights(X, y, alpha=.4, impStrategy='mean')
-used to show which weights go to zero from Lasso
-can change alpha: should test optimal first from performLassoReg()
-try removing the parameters with small weights from this graph in a Ridge Regression
getNewXfromLassoWeightThresh(X, y, alpha=.4, weightThresh=1, impStrategy='mean')
-Get new Pandas Df of X from a paramter weight Threshold in Lasso
- Should look at graph from showLassoParamWeights() to determine the threshold
"""


def performRidgeReg(X, y, folds=5, impStrategy='mean', aLow=0, aHigh=1, numAlphas=10):
    # use hold-out validation for analysis
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
    # create pipeline for model training/testing
    # (SimpleImputer replaces the Imputer class removed from scikit-learn)
    steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy=impStrategy)),
             ('scaler', StandardScaler()),
             ('ridgeReg', Ridge(random_state=2))]
    pipeline = Pipeline(steps)
    # create the different alpha parameters to test
    # (linspace avoids the float-step pitfalls of arange and includes aHigh)
    alphas = np.linspace(aLow, aHigh, numAlphas)
    param_grid = {'ridgeReg__alpha': alphas}
    # create the GridSearchCV
    gm_cv = GridSearchCV(pipeline, param_grid, cv=folds)
    # fit the grid-search cross-validated model, then evaluate on the hold-out set
    gm_cv.fit(X_train, y_train)
    y_pred = gm_cv.predict(X_test)
    r2 = gm_cv.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Best Alpha: " + str(gm_cv.best_params_))
    print("Tuned Ridge Reg R squared: " + str(r2))
    print("Tuned Ridge Reg MSE: " + str(mse))


def performLassoReg(X, y, folds=5, impStrategy='mean', aLow=0, aHigh=1, numAlphas=10):
    # use hold-out validation for analysis
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
    # create pipeline for model training/testing
    steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy=impStrategy)),
             ('scaler', StandardScaler()),
             ('LassoReg', Lasso(random_state=2))]
    pipeline = Pipeline(steps)
    # create the different alpha parameters to test
    # (pass aLow > 0: fitting Lasso at alpha=0 is not advised)
    alphas = np.linspace(aLow, aHigh, numAlphas)
    param_grid = {'LassoReg__alpha': alphas}
    # create the GridSearchCV
    gm_cv = GridSearchCV(pipeline, param_grid, cv=folds)
    # fit the grid-search cross-validated model, then evaluate on the hold-out set
    gm_cv.fit(X_train, y_train)
    y_pred = gm_cv.predict(X_test)
    r2 = gm_cv.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Best Alpha: " + str(gm_cv.best_params_))
    print("Tuned Lasso Reg R squared: " + str(r2))
    print("Tuned Lasso Reg MSE: " + str(mse))


def performElasticReg(X, y, folds=5, impStrategy='mean', numRatios=10, aLow=0, aHigh=1, numAlphas=10):
    # use hold-out validation for analysis
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
    # create pipeline for model training/testing
    steps = [('imputation', SimpleImputer(missing_values=np.nan, strategy=impStrategy)),
             ('scaler', StandardScaler()),
             ('elasticnet', ElasticNet(random_state=2))]
    pipeline = Pipeline(steps)
    # create the different alpha and l1_ratio parameters to test
    l1_ratios = np.linspace(0, 1, numRatios)
    alphas = np.linspace(aLow, aHigh, numAlphas)
    parameters = {'elasticnet__l1_ratio': l1_ratios,
                  'elasticnet__alpha': alphas}
    # create the GridSearchCV
    gm_cv = GridSearchCV(pipeline, parameters, cv=folds)
    # fit the grid-search cross-validated model, then evaluate on the hold-out set
    gm_cv.fit(X_train, y_train)
    y_pred = gm_cv.predict(X_test)
    r2 = gm_cv.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred)
    # best_params_ holds both the best alpha and the best l1_ratio
    print("Best Params: " + str(gm_cv.best_params_))
    print("Tuned Elastic Net R squared: " + str(r2))
    print("Tuned Elastic Net MSE: " + str(mse))


def testFitvsNumParms(X, y, impStrategy='mean'):
    # impute missing values
    imp = SimpleImputer(missing_values=np.nan, strategy=impStrategy)
    X = imp.fit_transform(X)
    # normalize data as done by LARS to allow for comparison
    X /= np.sqrt(np.sum(X ** 2, axis=0))
    # LassoLarsIC: least-angle regression with BIC/AIC criterion
    # (the removed normalize=True option is unnecessary since X is normalized above)
    model_bic = LassoLarsIC(criterion='bic')
    t1 = time.time()
    model_bic.fit(X, y)
    t_bic = time.time() - t1
    model_aic = LassoLarsIC(criterion='aic')
    model_aic.fit(X, y)
    plt.figure()
    plot_ic_criterion(model_aic, 'AIC', 'b')
    plot_ic_criterion(model_bic, 'BIC', 'r')
    plt.legend()
    plt.title('Information-criterion for model selection (training time %.3fs)'
              % t_bic)
    plt.show()


def showLassoParamWeights(X, y, alpha=.4, impStrategy='mean'):
    # get column names
    df_columns = X.columns
    # fill NaNs
    imp = SimpleImputer(strategy=impStrategy)
    Xnp = imp.fit_transform(X)
    # standardize (replaces the removed normalize=True option),
    # then fit Lasso at the requested alpha (previously ignored)
    Xnp = StandardScaler().fit_transform(Xnp)
    lasso = Lasso(alpha=alpha)
    lasso.fit(Xnp, y)
    lasso_coef = lasso.coef_
    # plot the coefficients against the feature names
    plt.plot(range(len(df_columns)), lasso_coef)
    plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
    plt.margins(0.02)
    plt.show()


def getNewXfromLassoWeightThresh(X, y, alpha=.4, weightThresh=1, impStrategy='mean'):
    # fill NaNs
    imp = SimpleImputer(strategy=impStrategy)
    Xnp = imp.fit_transform(X)
    # standardize (replaces the removed normalize=True option),
    # then fit Lasso at the requested alpha (previously ignored)
    Xnp = StandardScaler().fit_transform(Xnp)
    lasso = Lasso(alpha=alpha)
    lasso.fit(Xnp, y)
    lasso_coefs = lasso.coef_
    # keep only the columns whose coefficient magnitude meets the threshold
    # (absolute value, so strongly negative weights are kept as well)
    mask = np.abs(lasso_coefs) >= weightThresh
    return X.loc[:, mask]


# plots the AIC/BIC criterion curve for a fitted LassoLarsIC model
# (adapted from the scikit-learn documentation example)
def plot_ic_criterion(model, name, color):
    alpha_ = model.alpha_
    alphas_ = model.alphas_
    criterion_ = model.criterion_
    plt.plot(-np.log10(alphas_), criterion_, '--', color=color,
             linewidth=3, label='%s criterion' % name)
    plt.axvline(-np.log10(alpha_), color=color, linewidth=3,
                label='alpha: %s estimate' % name)
    plt.xlabel('-log(alpha)')
    plt.ylabel('criterion')
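

# Minimal smoke test: a hedged sketch, not part of the original API.
# Assumes pandas is installed; the data comes from
# sklearn.datasets.make_regression purely for illustration.
if __name__ == "__main__":
    import pandas as pd
    from sklearn.datasets import make_regression

    # small synthetic regression problem with named columns
    Xarr, yarr = make_regression(n_samples=200, n_features=8, noise=10.0,
                                 random_state=2)
    X = pd.DataFrame(Xarr, columns=['feat%d' % i for i in range(Xarr.shape[1])])
    y = pd.Series(yarr, name='target')

    # aLow > 0 avoids fitting Lasso/ElasticNet at alpha=0
    performRidgeReg(X, y, aLow=0.01)
    performLassoReg(X, y, aLow=0.01)
    performElasticReg(X, y, aLow=0.01)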