import numpy as np
from scipy.sparse import issparse
import _glmnet
from glmnet import GlmNet


class ElasticNet(GlmNet):
    '''The elastic net: a multivariate linear model with both L1 and L2
    regularizers.

    This class implements the elastic net family of predictive models. These
    models combine the classical ridge and lasso regularizers into a single
    penalty added to the sum of squared residuals loss function. More
    specifically, the loss function minimized by this model is:

        L(\beta_0, \beta_1, ..., \beta_n) =
            RSS(\beta_0, \beta_1, ..., \beta_n; X, y) +
            \lambda * ((1 - \alpha)/2 * ||\beta||_2^2 + \alpha * ||\beta||_1)

    where RSS is the usual residual sum of squares:

        RSS(\beta_0, \beta_1, ..., \beta_n; X, y) =
            \sum_i (y_i - \beta_0 - X_i \cdot \beta)^2
    '''
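
    # A minimal usage sketch (hypothetical: it assumes the GlmNet base class
    # accepts an ``alpha`` keyword in its constructor, which is not shown in
    # this file):
    #
    #     enet = ElasticNet(alpha=0.5)
    #     enet.fit(X, y)
    #     y_hat = enet.predict(X)   # one column of predictions per lambda
    #     enet.plot_paths()         # coefficient paths against log(lambda)
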
# TODO: Implement offsets.
def fit(self, X, y, col_names=None,
lambdas=None, weights=None, rel_penalties=None,
excl_preds=None, box_constraints=None):
'''Fit an elastic net model.
Arguments:
        * X: The model matrix. An n_obs * n_preds array.
        * y: The response. An n_obs array.
Optional Arguments:
        * lambdas:
            A user supplied list of lambdas; an elastic net will be fit for
            each lambda supplied. If no array is passed, glmnet will generate
            its own array of lambdas, equally spaced on a logarithmic scale
            between \lambda_max and \lambda_min.
        * weights:
            An n_obs array. Sample weights.
        * rel_penalties:
            An n_preds array. Relative penalty weights for the covariates. If
            none is passed, all covariates are penalized equally. If an array
            is passed, a zero entry indicates an unpenalized parameter and a
            one indicates a fully penalized parameter.
        * excl_preds:
            An n_preds array, used to exclude covariates from the model. To
            exclude predictors, pass an array with a 1 in the first position;
            a 1 in the (i+1)st position then excludes the ith covariate from
            the model fit. If no array is passed, all covariates in X are
            included in the model.
        * box_constraints:
            An array with dimension 2 * n_preds. Interval constraints on the
            fit coefficients. The (0, i) entry is a lower bound on the ith
            covariate, and the (1, i) entry is an upper bound. These must
            satisfy lower_bound <= 0 <= upper_bound. If no array is passed,
            no box constraints are applied to the parameters.
After fitting, the following attributes are set:
Private attributes:
* _n_fit_obs:
The number of rows in the model matrix X.
* _n_fit_params:
The number of columns in the model matrix X.
* _col_names:
Names for the columns in the model matrix. Used to display fit
coefficients.
        * _out_n_lambdas:
            The number of lambdas associated with non-zero models (i.e.
            models with at least one non-zero parameter estimate) after
            fitting; for large enough lambda the models will become zero in
            the presence of an L1 regularizer.
        * _intercepts:
            A one dimensional array containing the intercept estimates for
            each value of lambda. See the intercepts (no underscore)
            property for a public version.
        * _comp_coef:
            The fit parameter estimates in a compressed form. This is a
            matrix with each row giving the estimates for a single
            coefficient for various values of \lambda. The order of the rows
            does not correspond to the order of the coefficients as given in
            the design matrix used to fit the model; this association is
            given by the _p_comp_coef attribute. Only estimates that are
            non-zero for some lambda are reported.
        * _p_comp_coef:
            A one dimensional integer array associating the coefficients in
            _comp_coef to columns in the model matrix.
        * _indices:
            The same information as _p_comp_coef, but zero indexed to be
            compatible with numpy arrays.
* _n_comp_coef:
The number of parameter estimates that are non-zero for some
value of lambda.
* _n_passes:
The total number of passes over the data used to fit the model.
* _error_flag:
Error flag from the fortran code.
Public Attributes:
* r_sqs:
An array of length _out_n_lambdas containing the r-squared
statistic for each model.
* out_lambdas:
An array containing the lambda values associated with each fit
model.
'''
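        # A hypothetical illustration of the optional arrays described above
        # (shapes follow the docstring; names and values are illustrative
        # only):
        #
        #     n_preds = X.shape[1]
        #     rel_penalties = np.ones(n_preds)    # penalize every covariate...
        #     rel_penalties[0] = 0.0              # ...except the first.
        #     box_constraints = np.array([np.full(n_preds, -2.0),  # lower bounds
        #                                 np.full(n_preds, 2.0)])  # upper bounds
        #     model.fit(X, y, rel_penalties=rel_penalties,
        #               box_constraints=box_constraints)
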
self._check_if_unfit()
# Convert to arrays if native python objects
try:
if not issparse(X):
X = np.asanyarray(X)
y = np.asanyarray(y)
except ValueError:
            raise ValueError("X and y must be either numpy arrays, or "
                             "convertible to numpy arrays.")
        # Grab the design info from patsy for later use; we are about to write
        # over this object in some cases.
if hasattr(X, 'design_info'):
design_info = X.design_info
else:
design_info = None
# Make a copy if we are not able to overwrite X with its standardized
# version. Note that if X is not fortran contiguous, then it will be
# copied anyway.
if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
X = X.copy(order='F')
# Make a copy if we are not able to overwrite y with its standardized
# version.
if not self.overwrite_targ_ok:
y = y.copy()
# Validate all the inputs:
self._validate_matrix(X)
self._validate_inputs(X, y)
self._validate_lambdas(X, y, lambdas)
self._validate_weights(X, y, weights)
self._validate_rel_penalties(X, y, rel_penalties)
self._validate_excl_preds(X, y, excl_preds)
self._validate_box_constraints(X, y, box_constraints)
        # Setup is complete; call into the extension module.
if not issparse(X):
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.r_sqs,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.elnet(
self.alpha,
X,
y,
self.weights,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
else:
X.sort_indices()
# Fortran arrays are 1 indexed.
ind_ptrs = X.indptr + 1
indices = X.indices + 1
            # Call the sparse elastic net solver.
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.r_sqs,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.spelnet(
self.alpha,
X.shape[0],
X.shape[1],
X.data,
ind_ptrs,
indices,
y,
self.weights,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
self._check_errors()
        # The indexes into the predictor array are off by one due to the
        # fortran convention differing from numpy's; this makes them indexes
        # into the numpy array.
self._indices = np.trim_zeros(self._p_comp_coef, 'b') - 1
# Keep some model metadata.
self._n_fit_obs, self._n_fit_params = X.shape
        # Create a list of column names for the fit parameters; these can be
        # passed in, or attached to the matrix from patsy. If none are found
        # we create our own generic ones.
        if col_names is not None:
            self._col_names = col_names
        elif design_info is not None:
            self._col_names = design_info.column_names
        else:
            self._col_names = [
                'var_' + str(i) for i in range(self._n_fit_params)
            ]

@property
def _coefficients(self):
'''The fit model coefficients for each lambda.
A _n_comp_coef * _out_n_lambdas array containing the fit model
coefficients for each value of lambda.
'''
self._check_if_fit()
        return self._comp_coef[:np.max(self._n_comp_coef),
                               :self._out_n_lambdas]
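
    # A hypothetical sketch of mapping the compressed coefficients back to
    # design matrix columns (the instance name is illustrative):
    #
    #     coefs = enet._coefficients   # one row per retained coefficient
    #     cols = enet._indices         # columns of X those rows refer to
    #     names = [enet._col_names[i] for i in cols]
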
def _max_lambda(self, X, y, weights=None):
'''Return the maximum value of lambda useful for fitting, i.e. that
which first forces all coefficients to zero.
The calculation is derived from the discussion in "Regularization
Paths for Generalized Linear Models via Coordinate Descent".
'''
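        # Concretely, with the columns of X standardized (using the sample
        # weights w when they are given, and uniform weights otherwise), this
        # computes
        #
        #     lambda_max = max_j | <x_j, w * y> | / (sum(w) * alpha)
        #
        # via the dense or sparse helper below.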
if issparse(X):
return self._max_lambda_sparse(X, y, weights)
else:
            return self._max_lambda_dense(X, y, weights)

def _max_lambda_dense(self, X, y, weights=None):
dot = self._get_dot(X)
if weights is None:
# Standardize X and then find the maximum dot product.
normfac = X.shape[0]
mu = X.sum(axis=0) / normfac
mu2 = (X*X).sum(axis=0) / normfac
sigma = np.sqrt(mu2 - mu*mu)
# Avoid divide by zero in constant case.
sigma[sigma == 0] = 1
X_scaled = (X - mu) / sigma
dots = dot(y, X_scaled) / normfac
else:
# Standardize X using the sample weights and then find the
# maximum weighted dot product.
normfac = np.sum(weights)
mu = dot(weights, X) / normfac
mu2 = dot(weights, X*X) / normfac
sigma = np.sqrt(mu2 - mu*mu)
# Avoid divide by zero in constant case.
sigma[sigma == 0] = 1
X_scaled = (X - mu) / sigma
y_wtd = y * weights
dots = dot(y_wtd, X_scaled) / normfac
        # An alpha of zero (ridge) breaks the maximum lambda logic because the
        # coefficients are never all zero, so we clip alpha to a small value.
alpha = self.alpha if self.alpha > .0001 else .0001
        return np.max(np.abs(dots)) / alpha

def _max_lambda_sparse(self, X, y, weights=None):
'''To preserve the sparsity, we must avoid explicitly subtracting out
the mean of the columns.
'''
# Sparse dot
dot = self._get_dot(X)
        # Calculate the dot product of y with X standardized, without
        # destroying the sparsity of X. The calculations themselves do not
        # differ from the dense case.
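        # Since, column-wise, dot(y, (X - mu) / sigma) is equal to
        # (dot(y, X) - mu * sum(y)) / sigma, we can take the dot product with
        # the raw sparse X and correct for the (dense) column means afterwards.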
if weights is None:
normfac = X.shape[0]
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / normfac
mu = E(X)
mu_2 = E(X.multiply(X))
sigma = np.sqrt(mu_2 - mu*mu)
sigma[sigma == 0] = 1.0
dots = 1/sigma * (dot(y, X) - mu * np.sum(y)) / normfac
else:
normfac = np.sum(weights)
E = lambda M: dot(weights, M) / normfac
mu = E(X)
mu_2 = E(X.multiply(X))
sigma = np.sqrt(mu_2 - mu*mu)
sigma[sigma == 0] = 1.0
y_wtd = y * weights
dots = 1/sigma * (dot(y_wtd, X) - mu * np.sum(y_wtd)) / normfac
        # An alpha of zero (ridge) breaks the maximum lambda logic because the
        # coefficients are never all zero, so we clip alpha to a small value.
alpha = self.alpha if self.alpha > .0001 else .0001
return np.max(np.abs(dots)) / alpha

    def deviance(self, X, y, weights=None):
'''Calculate the normal deviance (i.e. sum of squared errors) for
every lambda. The model must already be fit to call this method.
'''
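        # For each lambda index k this evaluates, with weights w (uniform when
        # none are given),
        #
        #     deviance_k = sum_i w_i * (y_i - y_hat[i, k])**2 / sum_i w_i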
self._check_if_fit()
if weights is not None and weights.shape[0] != X.shape[0]:
raise ValueError("The weights vector must have the same length "
"as X."
)
y_hat = self.predict(X)
# Take the response y, and repeat it as a column to produce a matrix
# of the same dimensions as y_hat
y_stacked = np.tile(np.array([y]).transpose(), y_hat.shape[1])
if weights is None:
sq_residuals = (y_stacked - y_hat)**2
normfac = X.shape[0]
else:
w_stacked = np.tile(np.array([weights]).transpose(),
y_hat.shape[1]
)
sq_residuals = w_stacked * (y_stacked - y_hat)**2
normfac = np.sum(weights)
        return np.sum(sq_residuals, axis=0) / normfac

def predict(self, X):
'''Produce model predictions from new data.'''
        return self._predict_lp(X)

def plot_paths(self):
        '''Plot parameter estimate paths by log(lambda).'''
        self._plot_path('elastic')

def __str__(self):
        return self._str('elastic')

def describe(self, lidx=None):
return self._describe(lidx, 'elastic')