diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 06f147a..a16f9a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - name: CI on: @@ -16,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..450fe76 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,42 @@ +name: Generate API documentation +on: + push: + branches: [ "main" ] + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@main + - name: Set up Python 3 + uses: actions/setup-python@v5 + with: + python-version: 3.11 + + - name: Install requirements for documentation generation + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install docutils pydoctor + + - name: Run pydoctor + run: | + + # Run pydoctor build + pydoctor \ + --project-name=sparsely \ + --project-url=https://github.com/$GITHUB_REPOSITORY \ + --html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \ + --make-html \ + --html-output=./apidocs \ + --project-base-dir="$(pwd)" \ + --docformat=google \ + --intersphinx=https://docs.python.org/3/objects.inv \ + ./src/sparsely + + - name: Push API documentation to Github Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./apidocs + commit_message: "Generate API documentation" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..1c0b6e0 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,31 @@ +name: Upload Python 
Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index e821d4c..f357885 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # ⚡ sparsely ⚡ -`sparsely` is a `sklearn`-compatible Python module for sparse linear regression. It is fast, using a cutting plane algorithm that efficiently scales to thousands of samples and features. +`sparsely` is a `sklearn`-compatible Python module for sparse linear regression and classification. It is fast, using a cutting plane algorithm that efficiently scales to thousands of samples and features. This implementation follows [Bertsimas & Van Parys (2017)](https://arxiv.org/pdf/1709.10029.pdf). 
## Quick start diff --git a/pyproject.toml b/pyproject.toml index 81b5519..1278883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta" [project] name = "sparsely" -version = "0.0.1" +version = "1.0.0" authors = [ { name="Joshua Ivanhoe", email="joshua.k.ivanhoe@gmail.com" }, ] -description = "Scalable sparse linear regression" +description = "Scalable sparse linear models in Python" readme = "README.md" license = {file = "LICENSE"} -requires-python = ">=3.8" +requires-python = ">=3.9,<3.12" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", diff --git a/requirements.txt b/requirements.txt index 00150b5..223a7d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -halfspace-optimizer>=0.0.3 +halfspace-optimizer>=0.1.0 scikit-learn>=1.3.2 pre-commit>=3.6.0 pytest>=7.4.4 diff --git a/src/sparsely/__init__.py b/src/sparsely/__init__.py index 867e50b..c20e5ab 100644 --- a/src/sparsely/__init__.py +++ b/src/sparsely/__init__.py @@ -1,3 +1,4 @@ -"""The `sparsely` module implements a scalable sparse linear regression model.""" +"""The `sparsely` module implements scalable sparse linear models for classification and regression.""" +from .classifier import SparseLinearClassifier from .regressor import SparseLinearRegressor from .tune import tune_estimator diff --git a/src/sparsely/base.py b/src/sparsely/base.py new file mode 100644 index 0000000..e193c16 --- /dev/null +++ b/src/sparsely/base.py @@ -0,0 +1,192 @@ +"""This module implements an abstract base class for sparse linear models. + +It contains the shared functionality used for both classification and regression models, particularly in terms +of the fitting procedure. Feature selection is optimized using a scalable cutting plane algorithm. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from numbers import Real, Integral +from typing import Optional, Callable, ClassVar + +import numpy as np +from halfspace import Model +from sklearn.base import BaseEstimator +from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state +from sklearn.utils._param_validation import Interval +from sklearn.utils.validation import check_is_fitted + + +class BaseSparseEstimator(BaseEstimator, ABC): + """Base class for sparse linear models. + + The features are selected using an efficient cutting plane that scales to thousands of features and samples. As + the parameters and fitting procedure are the same for both regression and classification models, this class + implements the shared functionality. + + Attributes: + k: The sparsity parameter (i.e. number of non-zero coefficients). If `None`, then `k` is set to the square root + of the number of features, rounded to the nearest integer. + gamma: The regularization parameter. If `None`, then `gamma` is set to `1 / sqrt(n_samples)`. + normalize: Whether to normalize the data before fitting the model. + max_iters: The maximum number of iterations. + tol: The tolerance for the stopping criterion. + start: The initial guess for the selected features. If `None`, then the initial guess is randomly selected. + Providing a good initial guess based on problem-specific knowledge can significantly speed up the search. + random_state: Controls the random seed for the initial guess if a user-defined initial guess is not provided. + verbose: Whether to enable logging of the search progress. 
+ """ + + _parameter_constraints: ClassVar[dict[str, list]] = { + "k": [Interval(type=Integral, left=1, right=None, closed="left"), None], + "gamma": [Interval(type=Real, left=0, right=None, closed="left"), None], + "normalize": ["boolean"], + "max_iters": [Interval(type=Integral, left=1, right=None, closed="left")], + "tol": [Interval(type=Real, left=0, right=None, closed="left")], + "start": ["array-like", None], + "random_state": ["random_state"], + "verbose": ["boolean"], + } + + def __init__( + self, + k: Optional[int] = None, + gamma: Optional[float] = None, + normalize: bool = True, + max_iters: int = 500, + tol: float = 1e-4, + start: Optional[set[int]] = None, + random_state: Optional[int] = None, + verbose: bool = False, + ): + """Model constructor. + + Args: + k: The value for the `k` attribute. + gamma: The value for the `gamma` attribute. + normalize: The value for the `normalize` attribute. + max_iters: The value for the `max_iters` attribute. + tol: The value for the `tol` attribute. + start: The value for the `start` attribute. + random_state: The value for the `random_state` attribute. + verbose: The value for the `verbose` attribute. + """ + self.k = k + self.gamma = gamma + self.normalize = normalize + self.max_iters = max_iters + self.tol = tol + self.start = start + self.random_state = random_state + self.verbose = verbose + + def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator: + """Fit the model to the training data. + + Args: + X: The training data. The array should be of shape (n_samples, n_features). + y: The training labels. The array-like should be of shape (n_samples,). + + Returns: + The fitted model. 
+ """ + # Perform validation checks + X, y = self._validate_data(X=X, y=y) + self._validate_params() + + # Set hyperparameters to default values if not specified + self._k = self.k or int(np.sqrt(X.shape[1])) + self._gamma = self.gamma or 1 / np.sqrt(X.shape[0]) + + # Pre-process training data + if self.normalize: + self._scaler_X = StandardScaler() + X = self._scaler_X.fit_transform(X) + y = self._pre_process_y(y=y) + + # Initialize feature selection + if self.start is None: + rng = check_random_state(self.random_state) + start = rng.choice(X.shape[1], size=self._k, replace=False) + else: + start = self.start + + # Optimize feature selection + model = Model( + max_gap=self.tol, + max_gap_abs=self.tol, + log_freq=1 if self.verbose else None, + ) + selected = model.add_var_tensor( + shape=(X.shape[1],), var_type="B", name="selected" + ) + func = self._make_callback(X=X, y=y) + model.add_objective_term(var=selected, func=func, grad=True) + model.add_linear_constr(sum(selected) <= self._k) + model.add_linear_constr(sum(selected) >= 1) + model.start = [(selected[i], 1) for i in start] + model.optimize() + selected = np.round([model.var_value(var) for var in selected]).astype(bool) + + # Compute coefficients + self._coef = np.zeros(self.n_features_in_) + self._coef[selected] = self._fit_coef_for_subset(X_subset=X[:, selected], y=y) + + return self + + def predict(self, X: np.ndarray) -> np.ndarray: + """Predict using the fitted model. + + Args: + X: The training data. The array should be of shape (n_samples, n_features). + + Returns: + The predicted values. The array will be of shape (n_samples,). 
+ """ + check_is_fitted(estimator=self) + self._validate_data(X=X) + if self.normalize: + X = self._scaler_X.transform(X) + return self._predict(X=X) + + @property + def coef(self) -> np.ndarray: + """Get the coefficients of the linear model.""" + check_is_fitted(estimator=self) + return self._get_coef() + + @property + def intercept(self) -> float: + """Get the intercept of the linear model.""" + check_is_fitted(estimator=self) + return self._get_intercept() + + @abstractmethod + def _pre_process_y(self, y: np.ndarray) -> np.ndarray: + pass + + @abstractmethod + def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray: + pass + + @abstractmethod + def _get_coef(self) -> np.ndarray: + pass + + @abstractmethod + def _get_intercept(self) -> float: + pass + + @abstractmethod + def _make_callback( + self, + X: np.ndarray, + y: np.ndarray, + ) -> Callable[[np.ndarray], tuple[float, np.ndarray]]: + pass + + @abstractmethod + def _fit_coef_for_subset(self, X_subset: np.ndarray, y: np.ndarray) -> np.ndarray: + pass diff --git a/src/sparsely/classifier.py b/src/sparsely/classifier.py new file mode 100644 index 0000000..2764765 --- /dev/null +++ b/src/sparsely/classifier.py @@ -0,0 +1,97 @@ +"""This module implements a sparse linear model for classification problems.""" + +from __future__ import annotations + +from typing import Callable + +import numpy as np +from sklearn.base import ClassifierMixin +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils.validation import check_is_fitted + +from .base import BaseSparseEstimator + + +def _sigmoid(x: np.ndarray) -> np.ndarray: + """Compute the sigmoid function.""" + return 1 / (1 + np.exp(-x)) + + +class SparseLinearClassifier(BaseSparseEstimator, ClassifierMixin): + """Sparse linear model for classification. + + Currently, only binary classification is supported. 
The model is trained using the logistic loss function and the + L2 regularization penalty. The optimal features are selected using a scalable cutting plane algorithm. + """ + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + """Predict class probabilities using the fitted classifier. + + Args: + X: The data to predict. The array should be of shape (n_samples, n_features). + + Returns: + The predicted probabilities. Array of shape `(n_samples, 2)`: column 0 is the negative class, column 1 the positive class. + """ + check_is_fitted(estimator=self) + self._validate_data(X=X) + if self.normalize: + X = self._scaler_X.transform(X) + return self._predict(X=X, proba=True) + + def _pre_process_y(self, y: np.ndarray) -> np.ndarray: + self._binarizer = LabelBinarizer() # default 0/1 encoding, so `2 * b - 1` is -1/+1 for any pair of labels + return 2 * self._binarizer.fit_transform(y).flatten() - 1 + + def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray: + """Perform inference using the fitted model. + + Args: + X: The training data. The array should be of shape (n_samples, n_features). + proba: Whether to return the predicted probabilities. If `False`, then the predicted class labels are + returned instead. + + Returns: + The predicted values. The array will be of shape (n_samples,), or (n_samples, 2) if `proba` is `True`.
+ """ + predicted = _sigmoid(np.dot(X, self._coef) + self._intercept) + if proba: + return np.column_stack([1 - predicted, predicted]) + return self._binarizer.inverse_transform(predicted, threshold=0.5) + + def _get_coef(self) -> np.ndarray: + if self.normalize: + return self._coef / self._scaler_X.scale_ + return self._coef + + def _get_intercept(self) -> float: + if self.normalize: + return ( + self._intercept - (self._coef * self._scaler_X.mean_ / self._scaler_X.scale_).sum() # undo centering: each mean shift is weighted by its coefficient + ) + return self._intercept + + def _make_callback( + self, + X: np.ndarray, + y: np.ndarray, + ) -> Callable[[np.ndarray], tuple[float, np.ndarray]]: + def func(selected: np.ndarray) -> tuple[float, np.ndarray]: + X_subset = X[:, np.round(selected).astype(bool)] + coef_subset = self._fit_coef_for_subset(X_subset=X_subset, y=y) + log_odds = np.matmul(X_subset, coef_subset) + self._intercept + dual_vars = -y / (1 + np.exp(y * log_odds)) + loss = ( + dual_vars * y * np.log(-dual_vars * y) + - (1 + dual_vars * y) * np.log(1 + dual_vars * y) + ).sum() - 0.5 * self._gamma * (np.matmul(X_subset.T, dual_vars) ** 2).sum() + grad = -0.5 * self._gamma * np.matmul(X.T, dual_vars) ** 2 + return loss, grad + + return func + + def _fit_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray: + estimator = LogisticRegression(C=self._gamma, penalty="l2").fit(X=X_subset, y=y) + self._intercept = estimator.intercept_[0] + return estimator.coef_[0, :] diff --git a/src/sparsely/regressor.py b/src/sparsely/regressor.py index b481f75..eefc595 100644 --- a/src/sparsely/regressor.py +++ b/src/sparsely/regressor.py @@ -1,183 +1,74 @@ +"""This module implements a sparse linear model for regression problems.""" + from __future__ import annotations -from typing import Optional, Callable +from typing import Callable import numpy as np -from halfspace import Model -from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.base import RegressorMixin from sklearn.preprocessing import StandardScaler -from
sklearn.utils.validation import check_scalar, check_is_fitted - - -class SparseLinearRegressor(BaseEstimator, RegressorMixin): - """Sparse linear regressor.""" - - def __init__( - self, - k: Optional[int] = None, - gamma: Optional[float] = None, - normalize: bool = True, - max_iters: int = 500, - tol: float = 1e-4, - verbose: bool = False, - ): - """Model constructor. - - Args: - k: int or `None`, default=`None` - The sparsity parameter (i.e. number of non-zero coefficients). If `None`, then `k` is set to the - square root of the number of features, rounded to the nearest integer. - gamma: float or `None`, default=`None` - The regularization parameter. If `None`, then `gamma` is set to `1 / sqrt(n_samples)`. - normalize: bool, default=`True` - Whether to normalize the data before fitting the model. - max_iters: int, default=`500` - The maximum number of iterations. - tol: float, default=`1e-4` - The tolerance for the stopping criterion. - verbose: bool, default=`False` - Whether to enable logging of the search progress. - """ - self.k = k - self.gamma = gamma - self.normalize = normalize - self.max_iters = max_iters - self.tol = tol - self.verbose = verbose - def fit(self, X: np.ndarray, y: np.ndarray) -> SparseLinearRegressor: - """Fit the regressor to the training data. +from .base import BaseSparseEstimator - Args: - X: array-like of shape (n_samples, n_features) - The training data. - y: array-like of shape (n_samples,) - The training labels. - Returns: SparseLinearRegressor - The fitted regressor. - """ - # Perform validation checks - X, y = self._validate_data(X=X, y=y) - self._validate_params() - # Set hyperparameters to default values if not specified - self.k_ = self.k or int(np.sqrt(X.shape[1])) - self.gamma_ = self.gamma or 1 / np.sqrt(X.shape[0]) +class SparseLinearRegressor(BaseSparseEstimator, RegressorMixin): + """Sparse linear model for regression. 
- # Pre-process training data - if self.normalize: - self.scaler_X_ = StandardScaler() - self.scaler_y_ = StandardScaler() - X = self.scaler_X_.fit_transform(X) - y = self.scaler_y_.fit_transform(y[:, None])[:, 0] - - # Optimize feature selection - model = Model( - max_gap=self.tol, max_gap_abs=self.tol, log_freq=1 if self.verbose else None - ) - selected = model.add_var_tensor( - shape=(X.shape[1],), var_type="B", name="selected" - ) - func, grad = self._make_callbacks(X=X, y=y) - model.add_objective_term(var=selected, func=func, grad=grad) - model.add_linear_constr(sum(selected) <= self.k_) - model.optimize() - selected = np.round([model.var_value(var) for var in selected]).astype(bool) - - # Compute coefficients - self.coef_ = np.zeros(self.n_features_in_) - self.coef_[selected] = self._compute_coef_for_subset( - X_subset=X[:, selected], y=y - ) + The model is trained using the L2 loss function and the L2 regularization penalty. The optimal features are + selected using a scalable cutting plane algorithm. + """ - return self + def _pre_process_y(self, y: np.ndarray) -> np.ndarray: + """Normalize the target variable.""" + self._scaler_y = StandardScaler() + return self._scaler_y.fit_transform(y[:, None])[:, 0] - def predict(self, X: np.ndarray) -> np.ndarray: - """Predict using the fitted regressor. + def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray: + """Perform inference using the fitted model. Args: - X: array-like of shape (n_samples, n_features) - The data to predict. + X: The training data. The array should be of shape (n_samples, n_features). + proba: Not used. Exists for interoperability with the sparse linear classifier. - Returns: array-like of shape (n_samples,) - The predicted values. + Returns: + The predicted values. The array will be of shape (n_samples,).
""" - check_is_fitted(estimator=self) - self._validate_data(X=X) - if self.normalize: - X = self.scaler_X_.transform(X) - predicted = np.dot(X, self.coef_) - if self.normalize: - predicted = self.scaler_y_.inverse_transform(predicted[:, None])[:, 0] - return predicted + predicted = np.dot(X, self._coef) + return self._scaler_y.inverse_transform(predicted[:, None])[:, 0] - @property - def coef(self) -> np.ndarray: - """Get the coefficients of the linear model.""" - check_is_fitted(estimator=self) + def _get_coef(self) -> np.ndarray: if self.normalize: - return self.coef_ / self.scaler_X_.scale_ * self.scaler_y_.scale_ - return self.coef_ + return self._coef / self._scaler_X.scale_ * self._scaler_y.scale_ + return self._coef * self._scaler_y.scale_ # the target is always standardized, so rescale even without feature normalization - @property - def intercept(self) -> float: - """Get the intercept of the linear model.""" - check_is_fitted(estimator=self) + def _get_intercept(self) -> float: if self.normalize: return ( - -self.scaler_X_.mean_ / self.scaler_X_.scale_ * self.scaler_y_.scale_ - + self.scaler_y_.mean_ + self._scaler_y.mean_ + - (self._coef * self._scaler_X.mean_ / self._scaler_X.scale_).sum() + * self._scaler_y.scale_ ) - return 0 + return self._scaler_y.mean_[0] - def _validate_params(self): - if self.k is not None: - check_scalar( - x=self.k, - name="max_features", - target_type=int, - min_val=1, - max_val=self.n_features_in_, - include_boundaries="both", - ) - if self.gamma is not None: - check_scalar( - x=self.gamma, - name="gamma", - target_type=float, - min_val=0, - include_boundaries="neither", - ) - check_scalar( - x=self.normalize, - name="normalize", - target_type=bool, - ) - - def _make_callbacks( + def _make_callback( self, X: np.ndarray, y: np.ndarray, - ) -> tuple[Callable[[np.ndarray], float], Callable[[np.ndarray], np.ndarray]]: - def func(selected: np.ndarray) -> float: - X_subset = X[:, np.round(selected).astype(bool)] - coef = self._compute_coef_for_subset(X_subset=X_subset, y=y) - return 0.5 * np.dot(y, y - np.matmul(X_subset, coef)) - - def grad(selected: np.ndarray) -> np.ndarray: + ) ->
Callable[[np.ndarray], tuple[float, np.ndarray]]: + def func(selected: np.ndarray) -> tuple[float, np.ndarray]: X_subset = X[:, np.round(selected).astype(bool)] - # TODO: remove redundant computation of subset coef for gradient - coef = self._compute_coef_for_subset(X_subset=X_subset, y=y) - return ( - -0.5 * self.gamma_ * np.matmul(X.T, y - np.matmul(X_subset, coef)) ** 2 - ) + coef_subset = self._fit_coef_for_subset(X_subset=X_subset, y=y) + dual_vars = y - np.matmul(X_subset, coef_subset) + loss = 0.5 * np.dot(y, dual_vars) + grad = -0.5 * self._gamma * np.matmul(X.T, dual_vars) ** 2 + return loss, grad - return func, grad + return func - def _compute_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray: + def _fit_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray: return np.matmul( np.linalg.inv( - 1 / self.gamma_ * np.eye(X_subset.shape[1]) + 1 / self._gamma * np.eye(X_subset.shape[1]) + np.matmul(X_subset.T, X_subset) ), np.matmul(X_subset.T, y), diff --git a/src/sparsely/tune.py b/src/sparsely/tune.py index f3a44b1..5e0137f 100644 --- a/src/sparsely/tune.py +++ b/src/sparsely/tune.py @@ -1,19 +1,24 @@ +"""This module implements a function to tune the sparsity parameter of a linear model using cross-validation.""" + from copy import deepcopy from typing import Optional, Union import numpy as np import pandas as pd from sklearn.model_selection import cross_validate -from tqdm.auto import tqdm from sklearn.utils.validation import check_scalar +from tqdm.auto import tqdm +from .classifier import SparseLinearClassifier from .regressor import SparseLinearRegressor +Estimator = Union[SparseLinearRegressor, SparseLinearClassifier] + def tune_estimator( X: np.ndarray, y: np.ndarray, - estimator: Optional[SparseLinearRegressor] = None, + estimator: Estimator, k_min: int = 1, k_max: int = None, step_size: int = 1, @@ -21,39 +26,30 @@ def tune_estimator( cv: int = 3, return_search_log: bool = False, show_progress_bar: bool = False, -) -> 
Union[SparseLinearRegressor, tuple[SparseLinearRegressor, pd.DataFrame]]: - """Tune the sparsity parameter (i.e. number of non-zero coefficients) of the linear regressor. +) -> Union[Estimator, tuple[Estimator, pd.DataFrame]]: + """Tune the sparsity parameter (i.e. number of non-zero coefficients) of a linear model. The sparsity parameter is tuned by performing a grid search over the range [k_min, k_max] with step size `step_size`. If the test score does not improve for `max_iters_no_improvement` iterations, then the search is terminated early. Args: - X: np.ndarray of shape (n_samples, n_features) - The training data. - y: np.ndarray of shape (n_samples,) - The training labels. - estimator: SparseLinearRegressor or `None`, default=`None` - The estimator to tune. If `None`, then a default SparseLinearRegressor estimator is used. - k_min: int, default=1 - The minimum value for the sparsity parameter (i.e. number of non-zero coefficients). - k_max: int or `None`, default=`None` - The maximum sparsity for the sparsity parameter (i.e. number of non-zero coefficients). If `None`, then + X: The training data. The array should be of shape (n_samples, n_features) + y: The training labels. The array should be of shape (n_samples,). + estimator: The estimator to tune. This must be a `SparseLinearRegressor` instance (for regression problems) or + a `SparseLinearClassifier` instance (for classification problems). + k_min: The minimum value for the sparsity parameter (i.e. number of non-zero coefficients). + k_max: The maximum sparsity for the sparsity parameter (i.e. number of non-zero coefficients). If `None`, then this is set to `n_features`. - step_size: int, default=1 - The step size for the search. The sparsity parameter is incremented by this value at each iteration. Must - be less than or equal to `k_max - k_min`. 
- max_iters_no_improvement: int or `None`, default=None - The maximum number of iterations without improvement in the CV test score before the search is terminated. - If `None`, then no early stopping is performed. - cv: int, default=3 - The number of cross-validation folds. - return_search_log: bool, default=`False` - Whether to return the search log. - show_progress_bar: bool, default=`False` - Whether to show a progress bar. + step_size: The step size for the search. The sparsity parameter is incremented by this value at each iteration. + Must be less than or equal to `k_max - k_min`. + max_iters_no_improvement: The maximum number of iterations without improvement in the CV test score before the + search is terminated. If `None`, then no early stopping is performed. + cv: The number of cross-validation folds. + return_search_log: Whether to return the search log. + show_progress_bar: Whether to show a progress bar. - Returns: SparseLinearRegressor or tuple of SparseLinearRegressor and pd.DataFrame + Returns: The tuned estimator. If `return_search_log` is `True`, then a tuple of the tuned estimator and the search log.
""" # Perform validation checks @@ -93,7 +89,6 @@ def tune_estimator( check_scalar(x=cv, name="cv", target_type=int, min_val=2, include_boundaries="left") # Initialize the search - estimator = estimator or SparseLinearRegressor() best_score = -np.inf best_k = None n_iters_no_improvement = 0 @@ -109,8 +104,8 @@ def tune_estimator( X=X, y=y, cv=cv, - scoring="r2", - n_jobs=1, + scoring="roc_auc" if isinstance(estimator, SparseLinearClassifier) else "r2", + n_jobs=1, # parallelization will interfere with the MIP solver ) # Update search and check early termination condition diff --git a/tests/conftest.py b/tests/conftest.py index 5388313..8a01c69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,35 @@ import numpy as np import pytest -from sklearn.datasets import make_regression +from sklearn.datasets import make_regression, make_classification from sklearn.model_selection import train_test_split @pytest.fixture -def dataset() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """Generate a dataset for testing the regressor.""" +def regression_dataset() -> tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray +]: + """Generate a regression dataset.""" X, y, coef = make_regression( - n_samples=500, + n_samples=1000, n_features=10, n_informative=3, - noise=0.0, random_state=0, coef=True, ) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) return X_train, X_test, y_train, y_test, coef + + +@pytest.fixture +def classification_dataset() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Generate a classification dataset.""" + X, y = make_classification( + n_samples=1000, + n_features=10, + n_informative=3, + n_classes=2, + n_clusters_per_class=1, + random_state=0, + ) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + return X_train, X_test, y_train, y_test diff --git a/tests/test_classifier.py b/tests/test_classifier.py new file mode 100644 index 0000000..1fb1642
--- /dev/null +++ b/tests/test_classifier.py @@ -0,0 +1,50 @@ +import numpy as np +import pytest +from sklearn.metrics import roc_auc_score, balanced_accuracy_score + +from sparsely import SparseLinearClassifier + +Dataset = tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray] + + +# TODO: add tests for sklearn compatibility - requires support for multi-class problems + + +@pytest.mark.parametrize( + "estimator", + [ + SparseLinearClassifier(), + SparseLinearClassifier(normalize=False), + SparseLinearClassifier(k=3), + SparseLinearClassifier(gamma=1e-1), + ], +) +def test_sparse_linear_regressor( + classification_dataset: Dataset, estimator: SparseLinearClassifier +): + X_train, X_test, y_train, y_test = classification_dataset + estimator.fit(X_train, y_train) + predicted = estimator.predict(X_test) + predicted_proba = estimator.predict_proba(X_test) + assert estimator._coef.shape == (X_train.shape[1],) + assert predicted.shape == (X_test.shape[0],) + assert predicted_proba.shape == (X_test.shape[0], 2) + assert balanced_accuracy_score(y_test, predicted) > 0.9 + assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9 + assert estimator._coef.shape == (X_train.shape[1],) + + +@pytest.mark.parametrize( + "estimator", + [ + SparseLinearClassifier(k=0), + SparseLinearClassifier(k=11), + SparseLinearClassifier(gamma=-1e-2), + ], +) +def test_sparse_linear_regressor_invalid_params( + classification_dataset: Dataset, estimator: SparseLinearClassifier +): + X_train, X_test, y_train, y_test = classification_dataset + with pytest.raises(ValueError): + estimator.fit(X_train, y_train) diff --git a/tests/test_regressor.py b/tests/test_regressor.py index 6512a76..0ffbd0d 100644 --- a/tests/test_regressor.py +++ b/tests/test_regressor.py @@ -2,7 +2,7 @@ import pytest from sklearn.utils.estimator_checks import check_estimator -from sparsely.regressor import SparseLinearRegressor +from sparsely import SparseLinearRegressor Dataset = tuple[np.ndarray, np.ndarray, np.ndarray, 
np.ndarray, np.ndarray] @@ -20,16 +20,18 @@ def test_sklearn_compatibility(): SparseLinearRegressor(gamma=1e-2), ], ) -def test_sparse_linear_regressor(dataset: Dataset, estimator: SparseLinearRegressor): - X_train, X_test, y_train, y_test, coef = dataset +def test_sparse_linear_regressor( + regression_dataset: Dataset, estimator: SparseLinearRegressor +): + X_train, X_test, y_train, y_test, coef = regression_dataset predicted = estimator.fit(X_train, y_train).predict(X_test) - assert estimator.coef_.shape == (X_train.shape[1],) + assert estimator._coef.shape == (X_train.shape[1],) assert predicted.shape == (X_test.shape[0],) - assert estimator.score(X_train, y_train) > 0.8 - assert estimator.score(X_test, y_test) > 0.8 - assert estimator.coef_.shape == (X_train.shape[1],) - assert (~np.isclose(coef, 0)).sum() <= estimator.k_ - assert (np.isclose(estimator.coef_, 0) == np.isclose(coef, 0)).all() + assert estimator.score(X_train, y_train) > 0.95 + assert estimator.score(X_test, y_test) > 0.95 + assert estimator._coef.shape == (X_train.shape[1],) + assert (~np.isclose(coef, 0)).sum() <= estimator._k + assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all() @pytest.mark.parametrize( @@ -41,8 +43,8 @@ def test_sparse_linear_regressor(dataset: Dataset, estimator: SparseLinearRegres ], ) def test_sparse_linear_regressor_invalid_params( - dataset: Dataset, estimator: SparseLinearRegressor + regression_dataset: Dataset, estimator: SparseLinearRegressor ): - X_train, X_test, y_train, y_test, coef = dataset + X_train, X_test, y_train, y_test, coef = regression_dataset with pytest.raises(ValueError): estimator.fit(X_train, y_train) diff --git a/tests/test_tune.py b/tests/test_tune.py index 24677f3..91f1483 100644 --- a/tests/test_tune.py +++ b/tests/test_tune.py @@ -1,20 +1,24 @@ -from sparsely import tune_estimator import numpy as np import pandas as pd import pytest +from sparsely import tune_estimator, SparseLinearRegressor + 
@pytest.mark.parametrize("max_iters_no_improvement", [None, 1]) @pytest.mark.parametrize("return_search_log", [True, False]) def test_tune_estimator( - dataset: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray], + regression_dataset: tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray + ], max_iters_no_improvement: int, return_search_log: bool, ): - X_train, X_test, y_train, y_test, coef = dataset + X_train, X_test, y_train, y_test, coef = regression_dataset output = tune_estimator( X_train, y_train, + estimator=SparseLinearRegressor(), k_min=1, k_max=5, max_iters_no_improvement=max_iters_no_improvement, @@ -32,6 +36,6 @@ def test_tune_estimator( estimator = output assert estimator.score(X_train, y_train) > 0.8 assert estimator.score(X_test, y_test) > 0.8 - assert estimator.coef_.shape == (X_train.shape[1],) - assert (~np.isclose(coef, 0)).sum() <= estimator.k_ - assert (np.isclose(estimator.coef_, 0) == np.isclose(coef, 0)).all() + assert estimator._coef.shape == (X_train.shape[1],) + assert (~np.isclose(coef, 0)).sum() <= estimator._k + assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()