Sparse matrix support for Logistic Net.
madrury committed Jul 9, 2014
1 parent 906fbf6 commit c82178b
Showing 3 changed files with 127 additions and 50 deletions.
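
To orient the reader, here is a minimal usage sketch of what this commit enables from the caller's side. The `LogisticNet(alpha=...)` constructor and the import path are assumptions; only `fit` and `predict` appear in the diff below.

```python
# A minimal sketch, not taken verbatim from the repository's docs.
# Assumes LogisticNet(alpha=...) and that the glmnet package exposes
# LogisticNet (it is defined in glmnet/logistic_net.py).
import numpy as np
from scipy.sparse import csc_matrix
from glmnet import LogisticNet

rng = np.random.RandomState(0)
X = csc_matrix(rng.binomial(1, 0.1, size=(100, 20)).astype(np.float64))
y = rng.binomial(1, 0.5, size=100).astype(np.float64)

net = LogisticNet(alpha=0.5)
net.fit(X, y)            # X stays sparse; it is never densified
probs = net.predict(X)   # predictions on the probability scale
```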
3 changes: 1 addition & 2 deletions glmnet/elastic_net.py
@@ -80,7 +80,7 @@ def fit(self, X, y,
* out_lambdas: An array containing the lambda values associated with
each fit model.
'''
# Predictors and response
# Convert to arrays if native python objects
try:
if not issparse(X):
X = np.asanyarray(X)
@@ -222,7 +222,6 @@ def _max_lambda_sparse(self, X, y, weights=None):
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
mu_2 = E(X.multiply(X))
print mu_2 - mu*mu
sigma = np.sqrt(mu_2 - mu*mu)
# Calculating the dot product of y with X standardized, without
# destroying the sparsity of X
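
This hunk drops a leftover debug `print`; the surviving lines standardize the columns of a sparse matrix without densifying it, via the identity Var[X] = E[X^2] - E[X]^2. A standalone sketch of that trick:

```python
# Per-column standard deviations of a sparse matrix, computed without
# densification, mirroring the E() helper in _max_lambda_sparse above.
import numpy as np
from scipy.sparse import rand as sparse_rand

X = sparse_rand(1000, 50, density=0.05, format='csc')
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
sigma = np.sqrt(E(X.multiply(X)) - mu * mu)

# Agrees with the dense (population) standard deviation.
assert np.allclose(sigma, np.asarray(X.todense()).std(axis=0))
```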
50 changes: 27 additions & 23 deletions glmnet/glmnet.pyf
@@ -678,40 +678,44 @@ python module _glmnet ! in
integer :: nin
real*8 dimension(nc,nt),depend(nc,nt) :: ans
end subroutine lmodval

subroutine splognet(parm,no,ni,nc,x,ix,jx,y,g,jd,vp,cl,ne,nx,nlam,flmin,ulam,thr,isd,intr,maxit,kopt,lmu,a0,ca,ia,nin,dev0,dev,alm,nlp,jerr) ! in :glmnet5:glmnet5.f
real*8 :: parm
integer, optional,check(shape(y,0)==no),depend(y) :: no=shape(y,0)
integer, optional,check(len(vp)>=ni),depend(vp) :: ni=len(vp)
integer, optional,check(shape(g,1)==nc),depend(g) :: nc=shape(g,1)
integer :: no
integer :: ni
integer :: nc
real*8 dimension(*) :: x
integer dimension(*) :: ix
integer dimension(*) :: jx
real*8 dimension(no,max(2,nc)),depend(nc) :: y
real*8 dimension(no,nc),depend(no) :: g
real*8 dimension(no,max(2,nc)),depend(no,nc) :: y
real*8 dimension(no,shape(y,1)),depend(no,y) :: g
integer dimension(*) :: jd
real*8 dimension(ni) :: vp
real*8 dimension(ni),depend(ni) :: vp
real*8 dimension(2,ni),depend(ni) :: cl
integer :: ne
integer, optional,check(shape(ca,0)==nx),depend(ca) :: nx=shape(ca,0)
integer, optional,check(len(ulam)>=nlam),depend(ulam) :: nlam=len(ulam)
integer optional,depend(ni,nx) :: ne=min(ni,nx)
integer :: nx
integer optional,check((flmin < 1.0 || len(ulam)<=nlam)),depend(flmin,ulam) :: nlam=len(ulam)
real*8 :: flmin
real*8 dimension(nlam) :: ulam
real*8 :: thr
integer :: isd
integer :: intr
integer :: maxit
integer :: kopt
integer :: lmu
real*8 dimension(nc,nlam),depend(nc,nlam) :: a0
real*8 dimension(nx,nc,nlam),depend(nc,nlam) :: ca
integer dimension(nx),depend(nx) :: ia
integer dimension(nlam),depend(nlam) :: nin
real*8 :: dev0
real*8 dimension(nlam),depend(nlam) :: dev
real*8 dimension(nlam),depend(nlam) :: alm
integer :: nlp
integer :: jerr
integer optional :: isd=1 ! Standardize predictors by default
integer optional :: intr=1 ! Include intercept by default
integer optional :: maxit=100000
integer optional :: kopt=0 ! Use Newton's by default

! Outputs
integer intent(out) :: lmu
real*8 intent(out),dimension(nc,nlam),depend(nlam,nc) :: a0
real*8 intent(out),dimension(nx,nc,nlam),depend(nx,nc,nlam) :: ca
integer intent(out),dimension(nx),depend(nx) :: ia
integer intent(out),dimension(nlam),depend(nlam) :: nin
real*8 intent(out),dimension(nlam),depend(nlam) :: dev0
real*8 intent(out),dimension(nlam),depend(nlam) :: dev
real*8 intent(out),dimension(nlam),depend(nlam) :: alm
integer intent(out) :: nlp
integer intent(out) :: jerr
end subroutine splognet

subroutine multsplstandard2(no,ni,x,ix,jx,w,ju,isd,intr,xm,xs,xv) ! in :glmnet5:glmnet5.f
integer, optional,check(len(w)>=no),depend(w) :: no=len(w)
integer, optional,check(len(ju)>=ni),depend(ju) :: ni=len(ju)
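
The new `splognet` signature takes the predictors in compressed sparse column form. Judging from the call site added in logistic_net.py below, the sparse arguments map onto scipy's CSC attributes as follows; the mapping is inferred from that call, not from documentation:

```python
# Inferred mapping of a scipy CSC matrix onto splognet's x/ix/jx
# arguments, based on the call in logistic_net.py below.
import numpy as np
from scipy.sparse import csc_matrix

X = csc_matrix(np.eye(3))
x = X.data           # nonzero values, in column-major order
ix = X.indptr + 1    # column pointers, shifted to Fortran's 1-based indexing
jx = X.indices + 1   # row indices of the nonzeros, likewise 1-based
```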
124 changes: 99 additions & 25 deletions glmnet/logistic_net.py
@@ -1,4 +1,5 @@
import numpy as np
from scipy.sparse import issparse
from sklearn import preprocessing
import _glmnet
from glmnet import GlmNet
@@ -94,7 +95,15 @@ def fit(self, X, y,
'''
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
X = np.asanyarray(X)
# Convert to arrays if native python objects
try:
if not issparse(X):
X = np.asanyarray(X)
y = np.asanyarray(y)
except ValueError:
raise ValueError("X and y must be wither numpy arrays, or "
"convertable to numpy arrays."
)
# Fortran expects an n_obs * n_classes array for y. If a one
# dimensional array is passed, we construct an appropriate widening.
y = np.asanyarray(y)
@@ -115,13 +124,14 @@
# Make a copy if we are not able to overwrite X with its standardized
# version. Note that if X is not fortran contiguous, then it will be
# copied anyway.
if np.isfortran(X) and not self.overwrite_pred_ok:
if not issparse(X) and np.isfortran(X) and not self.overwrite_pred_ok:
X = X.copy(order='F')
# The target array will usually be overwritten with its standardized
# version, if this is not ok, we should copy.
if np.isfortran(y) and not self.overwrite_targ_ok:
y = y.copy(order='F')
# Validate all the inputs:
self._validate_matrix(X)
self._validate_inputs(X, y)
self._validate_lambdas(X, y, lambdas)
self._validate_weights(X, y, weights)
@@ -130,29 +140,62 @@
self._validate_box_constraints(X, y, box_constraints)
self._validate_offsets(X, y, offsets)
# Setup is complete, call the wrapper
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.lognet(self.alpha,
self.y_level_count,
X,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
if not issparse(X):
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.lognet(
self.alpha,
self.y_level_count,
X,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
else:
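            # glmnet's Fortran routines use 1-based (Fortran) indexing,
            # so shift scipy's 0-based CSC pointer and index arrays.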
ind_ptrs = X.indptr + 1
indices = X.indices + 1
(self._out_n_lambdas,
self._intercepts,
self._comp_coef,
self._p_comp_coef,
self._n_comp_coef,
self.null_dev,
self.exp_dev,
self.out_lambdas,
self._n_passes,
self._error_flag) = _glmnet.splognet(
self.alpha,
X.shape[0],
X.shape[1],
self.y_level_count,
X.data,
ind_ptrs,
indices,
y,
self.offsets,
self.excl_preds,
self.rel_penalties,
self.box_constraints,
self.max_vars_all,
self.frac_lg_lambda,
self.lambdas,
self.threshold,
nlam=self.n_lambdas
)
self._check_errors()
# Keep some model metadata
self._n_fit_obs, self._n_fit_params = X.shape
@@ -223,6 +266,12 @@ def _max_lambda(self, X, y, weights=None):
scale as the weights, which causes the normalization factor to drop
out of the equation.
'''
if issparse(X):
return self._max_lambda_sparse(X, y, weights)
else:
return self._max_lambda_dense(X, y, weights)

def _max_lambda_dense(self, X, y, weights=None):
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
X_scaled = preprocessing.scale(X)
@@ -241,6 +290,31 @@
alpha = self.alpha if self.alpha > .0001 else .0001
return np.max(np.abs(dots)) / alpha

def _max_lambda_sparse(self, X, y, weights=None):
if weights is not None:
raise ValueError("LogisticNet cannot be fit with weights.")
# Sparse dot product
dot = self._get_dot(X)
# Calculation is modeled on weighted least squares
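        # At the null model p = mean(y), one IRLS step gives working
        # response z = logit(p) + (y - p)/(p*(1 - p)) and weights p*(1 - p)/n.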
p = np.mean(y)
working_resp = np.log(p/(1-p)) + (y - p) / (p*(1 - p))
working_weights = p*(1 - p) / (X.shape[0])
# Sample expectation value of the columns in a matrix
E = lambda M: np.asarray(M.sum(axis=0)).ravel() / M.shape[0]
mu = E(X)
mu_2 = E(X.multiply(X))
sigma = np.sqrt(mu_2 - mu*mu)
# Calculating the dot product of y with X standardized, without
# destroying the sparsity of X
y_wtd = working_resp * working_weights
dots = 1/sigma * (dot(y_wtd, X) - mu * np.sum(y_wtd))
normfac = np.sum(working_weights)
# An alpha of zero (ridge) breaks the maximum lambda logic, the
# coefficients are never all zero - so we readjust to a small
# value.
alpha = self.alpha if self.alpha > .0001 else .0001
return np.max(np.abs(dots)) / alpha

def predict(self, X):
'''Return model predictions on the probability scale.'''
return 1 / ( 1 + np.exp(self._predict_lp(X)) )
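
Since `_max_lambda` now dispatches on sparsity, the two code paths can be sanity-checked against each other on the same data. A sketch, again assuming the `LogisticNet(alpha=...)` constructor and exercising the private methods from the diff above:

```python
# The dense and sparse max-lambda calculations should agree on
# identical data; method names are taken from the diff above.
import numpy as np
from scipy.sparse import csc_matrix
from glmnet import LogisticNet

rng = np.random.RandomState(1)
X = rng.binomial(1, 0.2, size=(200, 10)).astype(np.float64)
y = rng.binomial(1, 0.5, size=200).astype(np.float64)

net = LogisticNet(alpha=1.0)
assert np.isclose(net._max_lambda_dense(X, y),
                  net._max_lambda_sparse(csc_matrix(X), y))
```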
