Sparse matrix support for Logistic Net.
madrury committed Jul 9, 2014
1 parent 906fbf6 commit c82178b
Showing 3 changed files with 127 additions and 50 deletions.
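
To orient the reader, here is a minimal usage sketch of what this commit enables from the caller's side. The `LogisticNet(alpha=...)` constructor and the import path are assumptions; only `fit` and `predict` appear in the diff below.

```python
# A minimal sketch, not taken verbatim from the repository's docs.
# Assumes LogisticNet(alpha=...) and that the glmnet package exposes
# LogisticNet (it is defined in glmnet/logistic_net.py).
import numpy as np
from scipy.sparse import csc_matrix
from glmnet import LogisticNet

rng = np.random.RandomState(0)
X = csc_matrix(rng.binomial(1, 0.1, size=(100, 20)).astype(np.float64))
y = rng.binomial(1, 0.5, size=100).astype(np.float64)

net = LogisticNet(alpha=0.5)
net.fit(X, y)            # X stays sparse; it is never densified
probs = net.predict(X)   # predictions on the probability scale
```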
3 changes: 1 addition & 2 deletions glmnet/elastic_net.py
@@ -80,7 +80,7 @@ def fit(self, X, y,
* out_lambdas: An array containing the lambda values associated with
each fit model.
'''
# Predictors and response
# Convert to arrays if native python objects
try:
if not issparse(X):
X = np.asanyarray(X)
@@ -222,7 +222,6 @@ def _max_lambda_sparse(self, X, y, weights=None):
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
mu_2 = E(X.multiply(X))
print mu_2 - mu*mu
sigma = np.sqrt(mu_2 - mu*mu)
# Calculating the dot product of y with X standardized, without
# destroying the sparsity of X
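
This hunk drops a leftover debug `print`; the surviving lines standardize the columns of a sparse matrix without densifying it, via the identity Var[X] = E[X^2] - E[X]^2. A standalone sketch of that trick:

```python
# Per-column standard deviations of a sparse matrix, computed without
# densification, mirroring the E() helper in _max_lambda_sparse above.
import numpy as np
from scipy.sparse import rand as sparse_rand

X = sparse_rand(1000, 50, density=0.05, format='csc')
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
sigma = np.sqrt(E(X.multiply(X)) - mu * mu)

# Agrees with the dense (population) standard deviation.
assert np.allclose(sigma, np.asarray(X.todense()).std(axis=0))
```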
50 changes: 27 additions & 23 deletions glmnet/glmnet.pyf
@@ -678,40 +678,44 @@ python module _glmnet ! in
integer :: nin
real*8 dimension(nc,nt),depend(nc,nt) :: ans
end subroutine lmodval

subroutine splognet(parm,no,ni,nc,x,ix,jx,y,g,jd,vp,cl,ne,nx,nlam,flmin,ulam,thr,isd,intr,maxit,kopt,lmu,a0,ca,ia,nin,dev0,dev,alm,nlp,jerr) ! in :glmnet5:glmnet5.f
real*8 :: parm
integer, optional,check(shape(y,0)==no),depend(y) :: no=shape(y,0)
integer, optional,check(len(vp)>=ni),depend(vp) :: ni=len(vp)
integer, optional,check(shape(g,1)==nc),depend(g) :: nc=shape(g,1)
integer :: no
integer :: ni
integer :: nc
real*8 dimension(*) :: x
integer dimension(*) :: ix
integer dimension(*) :: jx
real*8 dimension(no,max(2,nc)),depend(nc) :: y
real*8 dimension(no,nc),depend(no) :: g
real*8 dimension(no,max(2,nc)),depend(no,nc) :: y
real*8 dimension(no,shape(y,1)),depend(no,y) :: g
integer dimension(*) :: jd
real*8 dimension(ni) :: vp
real*8 dimension(ni),depend(ni) :: vp
real*8 dimension(2,ni),depend(ni) :: cl
integer :: ne
integer, optional,check(shape(ca,0)==nx),depend(ca) :: nx=shape(ca,0)
integer, optional,check(len(ulam)>=nlam),depend(ulam) :: nlam=len(ulam)
integer optional,depend(ni,nx) :: ne=min(ni,nx)
integer :: nx
integer optional,check((flmin < 1.0 || len(ulam)<=nlam)),depend(flmin,ulam) :: nlam=len(ulam)
real*8 :: flmin
real*8 dimension(nlam) :: ulam
real*8 :: thr
integer :: isd
integer :: intr
integer :: maxit
integer :: kopt
integer :: lmu
real*8 dimension(nc,nlam),depend(nc,nlam) :: a0
real*8 dimension(nx,nc,nlam),depend(nc,nlam) :: ca
integer dimension(nx),depend(nx) :: ia
integer dimension(nlam),depend(nlam) :: nin
real*8 :: dev0
real*8 dimension(nlam),depend(nlam) :: dev
real*8 dimension(nlam),depend(nlam) :: alm
integer :: nlp
integer :: jerr
integer optional :: isd=1 ! Standardize predictors by default
integer optional :: intr=1 ! Include intercept by default
integer optional :: maxit=100000
integer optional :: kopt=0 ! Use Newton's by default

! Outputs
integer intent(out) :: lmu
real*8 intent(out),dimension(nc,nlam),depend(nlam,nc) :: a0
real*8 intent(out),dimension(nx,nc,nlam),depend(nx,nc,nlam) :: ca
integer intent(out),dimension(nx),depend(nx) :: ia
integer intent(out),dimension(nlam),depend(nlam) :: nin
real*8 intent(out),dimension(nlam),depend(nlam) :: dev0
real*8 intent(out),dimension(nlam),depend(nlam) :: dev
real*8 intent(out),dimension(nlam),depend(nlam) :: alm
integer intent(out) :: nlp
integer intent(out) :: jerr
end subroutine splognet

subroutine multsplstandard2(no,ni,x,ix,jx,w,ju,isd,intr,xm,xs,xv) ! in :glmnet5:glmnet5.f
integer, optional,check(len(w)>=no),depend(w) :: no=len(w)
integer, optional,check(len(ju)>=ni),depend(ju) :: ni=len(ju)
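
The new `splognet` signature takes the predictors in compressed sparse column form. Judging from the call site added in logistic_net.py below, the sparse arguments map onto scipy's CSC attributes as follows; the mapping is inferred from that call, not from documentation:

```python
# Inferred mapping of a scipy CSC matrix onto splognet's x/ix/jx
# arguments, based on the call in logistic_net.py below.
import numpy as np
from scipy.sparse import csc_matrix

X = csc_matrix(np.eye(3))
x = X.data           # nonzero values, in column-major order
ix = X.indptr + 1    # column pointers, shifted to Fortran's 1-based indexing
jx = X.indices + 1   # row indices of the nonzeros, likewise 1-based
```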
124 changes: 99 additions & 25 deletions glmnet/logistic_net.py
@@ -1,4 +1,5 @@
import numpy as np
from scipy.sparse import issparse
from sklearn import preprocessing
import _glmnet
from glmnet import GlmNet
@@ -94,7 +95,15 @@ def fit(self, X, y,
'''
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
X = np.asanyarray(X)
# Convert to arrays if native python objects
try:
if not issparse(X):
X = np.asanyarray(X)
y = np.asanyarray(y)
except ValueError:
raise ValueError("X and y must be wither numpy arrays, or "
"convertable to numpy arrays."
)
# Fortran expects an n_obs * n_classes array for y. If a one
# dimensional array is passed, we construct an appropriate widening.
y = np.asanyarray(y)
@@ -115,13 +124,14 @@
# Make a copy if we are not able to overwrite X with its standardized
# version. Note that if X is not fortran contiguous, then it will be
# copied anyway.
if np.isfortran(X) and not self.overwrite_pred_ok:
if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
X = X.copy(order='F')
# The target array will usually be overwritten with its standardized
# version, if this is not ok, we should copy.
if np.isfortran(y) and not self.overwrite_targ_ok:
y = y.copy(order='F')
# Validate all the inputs:
self._validate_matrix(X)
self._validate_inputs(X, y)
self._validate_lambdas(X, y, lambdas)
self._validate_weights(X, y, weights)
@@ -130,29 +140,62 @@
self._validate_box_constraints(X, y, box_constraints)
self._validate_offsets(X, y, offsets)
# Setup is complete, call the wrapper
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.lognet(self.alpha,
self.y_level_count,
X,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
if not issparse(X):
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.lognet(
self.alpha,
self.y_level_count,
X,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
else:
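            # glmnet's Fortran routines use 1-based (Fortran) indexing,
            # so shift scipy's 0-based CSC pointer and index arrays.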
ind_ptrs = X.indptr + 1
indices = X.indices + 1
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.splognet(
self.alpha,
X.shape[0],
X.shape[1],
self.y_level_count,
X.data,
ind_ptrs,
indices,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
self._check_errors()
# Keep some model metadata
self._n_fit_obs, self._n_fit_params = X.shape
@@ -223,6 +266,12 @@ def _max_lambda(self, X, y, weights=None):
scale as the weights, which causes the normalization factor to drop
out of the equation.
'''
if issparse(X):
return self._max_lambda_sparse(X, y, weights)
else:
return self._max_lambda_dense(X, y, weights)

def _max_lambda_dense(self, X, y, weights=None):
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
X_scaled = preprocessing.scale(X)
@@ -241,6 +290,31 @@
alpha = self.alpha if self.alpha > .0001 else .0001
return np.max(np.abs(dots)) / alpha

def _max_lambda_sparse(self, X, y, weights=None):
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
# Sparse dot product
dot = self._get_dot(X)
# Calculation is modeled on weighted least squares
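        # At the null model p = mean(y), one IRLS step gives working
        # response z = logit(p) + (y - p)/(p*(1 - p)) and weights p*(1 - p)/n.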
p = np.mean(y)
working_resp = np.log(p/(1-p)) + (y - p) / (p*(1 - p))
working_weights = p*(1 - p) / (X.shape[0])
# Sample expectation value of the columns in a matrix
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
mu_2 = E(X.multiply(X))
sigma = np.sqrt(mu_2 - mu*mu)
# Calculating the dot product of y with X standardized, without
# destroying the sparsity of X
y_wtd = working_resp * working_weights
dots = 1/sigma * (dot(y_wtd, X) - mu * np.sum(y_wtd))
normfac = np.sum(working_weights)
# An alpha of zero (ridge) breaks the maximum lambda logic, the
# coefficients are never all zero - so we readjust to a small
# value.
alpha = self.alpha if self.alpha > .0001 else .0001
return np.max(np.abs(dots)) / alpha

def predict(self, X):
'''Return model predictions on the probability scale.'''
return 1 / ( 1 + np.exp(self._predict_lp(X)) )
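
Since `_max_lambda` now dispatches on sparsity, the two code paths can be sanity-checked against each other on the same data. A sketch, again assuming the `LogisticNet(alpha=...)` constructor and exercising the private methods from the diff above:

```python
# The dense and sparse max-lambda calculations should agree on
# identical data; method names are taken from the diff above.
import numpy as np
from scipy.sparse import csc_matrix
from glmnet import LogisticNet

rng = np.random.RandomState(1)
X = rng.binomial(1, 0.2, size=(200, 10)).astype(np.float64)
y = rng.binomial(1, 0.5, size=200).astype(np.float64)

net = LogisticNet(alpha=1.0)
assert np.isclose(net._max_lambda_dense(X, y),
                  net._max_lambda_sparse(csc_matrix(X), y))
```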
