diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 06f147a..a16f9a2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,6 +1,3 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
-
 name: CI
 
 on:
@@ -16,7 +13,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.9", "3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..450fe76
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,42 @@
+name: Generate API documentation
+on:
+  push:
+    branches: [ "main" ]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4  # pinned release tag instead of the moving main branch
+    - name: Set up Python 3
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.11"
+
+    - name: Install requirements for documentation generation
+      run: |
+        python -m pip install --upgrade pip setuptools wheel
+        python -m pip install docutils pydoctor
+
+    - name: Run pydoctor
+      run: |
+
+        # Run pydoctor build
+        pydoctor \
+            --project-name=sparsely \
+            --project-url=https://github.com/$GITHUB_REPOSITORY \
+            --html-viewsource-base=https://github.com/$GITHUB_REPOSITORY/tree/$GITHUB_SHA \
+            --make-html \
+            --html-output=./apidocs \
+            --project-base-dir="$(pwd)" \
+            --docformat=google \
+            --intersphinx=https://docs.python.org/3/objects.inv \
+            ./src/sparsely
+
+    - name: Push API documentation to Github Pages
+      uses: peaceiris/actions-gh-pages@v3
+      with:
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+        publish_dir: ./apidocs
+        commit_message: "Generate API documentation"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..1c0b6e0
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,31 @@
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.11'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/README.md b/README.md
index e821d4c..f357885 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 
 
 # ⚡ sparsely ⚡
-`sparsely` is a `sklearn`-compatible Python module for sparse linear regression. It is fast, using a cutting plane algorithm that efficiently scales to thousands of samples and features.
+`sparsely` is a `sklearn`-compatible Python module for sparse linear regression and classification. It is fast, using a cutting plane algorithm that efficiently scales to thousands of samples and features.
 This implementation follows [Bertsimas & Van Parys (2017)](https://arxiv.org/pdf/1709.10029.pdf).
 
 ## Quick start
diff --git a/pyproject.toml b/pyproject.toml
index 81b5519..1278883 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sparsely"
-version = "0.0.1"
+version = "1.0.0"
 authors = [
   { name="Joshua Ivanhoe", email="joshua.k.ivanhoe@gmail.com" },
 ]
-description = "Scalable sparse linear regression"
+description = "Scalable sparse linear models in Python"
 readme = "README.md"
 license = {file = "LICENSE"}
-requires-python = ">=3.8"
+requires-python = ">=3.9,<3.12"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
diff --git a/requirements.txt b/requirements.txt
index 00150b5..223a7d6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-halfspace-optimizer>=0.0.3
+halfspace-optimizer>=0.1.0
 scikit-learn>=1.3.2
 pre-commit>=3.6.0
 pytest>=7.4.4
diff --git a/src/sparsely/__init__.py b/src/sparsely/__init__.py
index 867e50b..c20e5ab 100644
--- a/src/sparsely/__init__.py
+++ b/src/sparsely/__init__.py
@@ -1,3 +1,4 @@
-"""The `sparsely` module implements a scalable sparse linear regression model."""
+"""The `sparsely` module implements scalable sparse linear models for classification and regression."""
+from .classifier import SparseLinearClassifier
 from .regressor import SparseLinearRegressor
 from .tune import tune_estimator
diff --git a/src/sparsely/base.py b/src/sparsely/base.py
new file mode 100644
index 0000000..e193c16
--- /dev/null
+++ b/src/sparsely/base.py
@@ -0,0 +1,192 @@
+"""This module implements an abstract base class for sparse linear models.
+
+It contains the shared functionality used for both classification and regression models, particularly in terms
+of the fitting procedure. Feature selection is optimized using a scalable cutting plane algorithm.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from numbers import Real, Integral
+from typing import Optional, Callable, ClassVar
+
+import numpy as np
+from halfspace import Model
+from sklearn.base import BaseEstimator
+from sklearn.preprocessing import StandardScaler
+from sklearn.utils import check_random_state
+from sklearn.utils._param_validation import Interval
+from sklearn.utils.validation import check_is_fitted
+
+
+class BaseSparseEstimator(BaseEstimator, ABC):
+    """Base class for sparse linear models.
+
+    The features are selected using an efficient cutting plane that scales to thousands of features and samples. As
+    the parameters and fitting procedure are the same for both regression and classification models, this class
+    implements the shared functionality.
+
+    Attributes:
+        k: The sparsity parameter (i.e. number of non-zero coefficients). If `None`, then `k` is set to the square root
+            of the number of features, rounded down to an integer.
+        gamma: The regularization parameter. If `None`, then `gamma` is set to `1 / sqrt(n_samples)`.
+        normalize: Whether to normalize the data before fitting the model.
+        max_iters: The maximum number of iterations.
+        tol: The tolerance for the stopping criterion.
+        start: The initial guess for the selected features. If `None`, then the initial guess is randomly selected.
+            Providing a good initial guess based on problem-specific knowledge can significantly speed up the search.
+        random_state: Controls the random seed for the initial guess if a user-defined initial guess is not provided.
+        verbose: Whether to enable logging of the search progress.
+    """
+
+    _parameter_constraints: ClassVar[dict[str, list]] = {
+        "k": [Interval(type=Integral, left=1, right=None, closed="left"), None],
+        "gamma": [Interval(type=Real, left=0, right=None, closed="neither"), None],  # gamma must be > 0
+        "normalize": ["boolean"],
+        "max_iters": [Interval(type=Integral, left=1, right=None, closed="left")],
+        "tol": [Interval(type=Real, left=0, right=None, closed="left")],
+        "start": ["array-like", None],
+        "random_state": ["random_state"],
+        "verbose": ["boolean"],
+    }
+
+    def __init__(
+        self,
+        k: Optional[int] = None,
+        gamma: Optional[float] = None,
+        normalize: bool = True,
+        max_iters: int = 500,
+        tol: float = 1e-4,
+        start: Optional[set[int]] = None,
+        random_state: Optional[int] = None,
+        verbose: bool = False,
+    ):
+        """Model constructor.
+
+        Args:
+            k: The value for the `k` attribute.
+            gamma: The value for the `gamma` attribute.
+            normalize: The value for the `normalize` attribute.
+            max_iters: The value for the `max_iters` attribute.
+            tol: The value for the `tol` attribute.
+            start: The value for the `start` attribute.
+            random_state: The value for the `random_state` attribute.
+            verbose: The value for the `verbose` attribute.
+        """
+        self.k = k
+        self.gamma = gamma
+        self.normalize = normalize
+        self.max_iters = max_iters
+        self.tol = tol
+        self.start = start
+        self.random_state = random_state
+        self.verbose = verbose
+
+    def fit(self, X: np.ndarray, y: np.ndarray) -> BaseSparseEstimator:
+        """Fit the model to the training data.
+
+        Args:
+            X: The training data. The array should be of shape (n_samples, n_features).
+            y: The training labels. The array-like should be of shape (n_samples,).
+
+        Returns:
+            The fitted model.
+        """
+        # Perform validation checks
+        X, y = self._validate_data(X=X, y=y)
+        self._validate_params()
+
+        # Set hyperparameters to default values if not specified
+        self._k = self.k or int(np.sqrt(X.shape[1]))
+        self._gamma = self.gamma or 1 / np.sqrt(X.shape[0])
+
+        # Pre-process training data
+        if self.normalize:
+            self._scaler_X = StandardScaler()
+            X = self._scaler_X.fit_transform(X)
+        y = self._pre_process_y(y=y)
+
+        # Initialize feature selection
+        if self.start is None:
+            rng = check_random_state(self.random_state)
+            start = rng.choice(X.shape[1], size=self._k, replace=False)
+        else:
+            start = self.start
+
+        # Optimize feature selection
+        model = Model(
+            max_gap=self.tol,
+            max_gap_abs=self.tol,
+            log_freq=1 if self.verbose else None,
+        )
+        selected = model.add_var_tensor(
+            shape=(X.shape[1],), var_type="B", name="selected"
+        )
+        func = self._make_callback(X=X, y=y)
+        model.add_objective_term(var=selected, func=func, grad=True)
+        model.add_linear_constr(sum(selected) <= self._k)
+        model.add_linear_constr(sum(selected) >= 1)
+        model.start = [(selected[i], 1) for i in start]
+        model.optimize()
+        selected = np.round([model.var_value(var) for var in selected]).astype(bool)
+
+        # Compute coefficients
+        self._coef = np.zeros(self.n_features_in_)
+        self._coef[selected] = self._fit_coef_for_subset(X_subset=X[:, selected], y=y)
+
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Predict using the fitted model.
+
+        Args:
+            X: The data to predict. The array should be of shape (n_samples, n_features).
+
+        Returns:
+            The predicted values. The array will be of shape (n_samples,).
+        """
+        check_is_fitted(estimator=self)
+        X = self._validate_data(X=X, reset=False)  # keep fitted n_features_in_; use validated array
+        if self.normalize:
+            X = self._scaler_X.transform(X)
+        return self._predict(X=X)
+
+    @property
+    def coef(self) -> np.ndarray:
+        """Get the coefficients of the linear model."""
+        check_is_fitted(estimator=self)
+        return self._get_coef()
+
+    @property
+    def intercept(self) -> float:
+        """Get the intercept of the linear model."""
+        check_is_fitted(estimator=self)
+        return self._get_intercept()
+
+    @abstractmethod
+    def _pre_process_y(self, y: np.ndarray) -> np.ndarray:
+        pass
+
+    @abstractmethod
+    def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray:
+        pass
+
+    @abstractmethod
+    def _get_coef(self) -> np.ndarray:
+        pass
+
+    @abstractmethod
+    def _get_intercept(self) -> float:
+        pass
+
+    @abstractmethod
+    def _make_callback(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+    ) -> Callable[[np.ndarray], tuple[float, np.ndarray]]:
+        pass
+
+    @abstractmethod
+    def _fit_coef_for_subset(self, X_subset: np.ndarray, y: np.ndarray) -> np.ndarray:
+        pass
diff --git a/src/sparsely/classifier.py b/src/sparsely/classifier.py
new file mode 100644
index 0000000..2764765
--- /dev/null
+++ b/src/sparsely/classifier.py
@@ -0,0 +1,97 @@
+"""This module implements a sparse linear model for classification problems."""
+
+from __future__ import annotations
+
+from typing import Callable
+
+import numpy as np
+from sklearn.base import ClassifierMixin
+from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import LabelBinarizer
+from sklearn.utils.validation import check_is_fitted
+
+from .base import BaseSparseEstimator
+
+
+def _sigmoid(x: np.ndarray) -> np.ndarray:
+    """Compute the sigmoid function."""
+    return 1 / (1 + np.exp(-x))
+
+
+class SparseLinearClassifier(BaseSparseEstimator, ClassifierMixin):
+    """Sparse linear model for classification.
+
+    Currently, only binary classification is supported. The model is trained using the logistic loss function and the
+    L2 regularization penalty. The optimal features are selected using a scalable cutting plane algorithm.
+    """
+
+    def predict_proba(self, X: np.ndarray) -> np.ndarray:
+        """Predict class probabilities using the fitted classifier.
+
+        Args:
+            X: The data to predict. The array should be of shape (n_samples, n_features).
+
+        Returns:
+            The predicted probabilities. Array of shape `(n_samples, 2)` with columns `[1 - p, p]`.
+        """
+        check_is_fitted(estimator=self)
+        X = self._validate_data(X=X, reset=False)  # keep fitted n_features_in_; use validated array
+        if self.normalize:
+            X = self._scaler_X.transform(X)
+        return self._predict(X=X, proba=True)
+
+    def _pre_process_y(self, y: np.ndarray) -> np.ndarray:
+        self._binarizer = LabelBinarizer()  # default 0/1 encoding so 2 * b - 1 is always in {-1, +1}
+        return 2 * self._binarizer.fit_transform(y).flatten() - 1
+
+    def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray:
+        """Perform inference using the fitted model.
+
+        Args:
+            X: The training data. The array should be of shape (n_samples, n_features).
+            proba: Whether to return the predicted probabilities. If `False`, then the predicted class labels are
+                returned instead.
+
+        Returns:
+            The predicted values. The array will be of shape (n_samples,).
+        """
+        predicted = _sigmoid(np.dot(X, self._coef) + self._intercept)
+        if proba:
+            return np.column_stack([1 - predicted, predicted])
+        return self._binarizer.inverse_transform(predicted, threshold=0.5)
+
+    def _get_coef(self) -> np.ndarray:
+        if self.normalize:
+            return self._coef / self._scaler_X.scale_
+        return self._coef
+
+    def _get_intercept(self) -> float:
+        if self.normalize:
+            # Map back to raw features: b - sum_j coef_j * mean_j / scale_j
+            scaled_means = self._scaler_X.mean_ / self._scaler_X.scale_
+            return self._intercept - np.dot(self._coef, scaled_means)
+        return self._intercept
+
+    def _make_callback(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+    ) -> Callable[[np.ndarray], tuple[float, np.ndarray]]:
+        def func(selected: np.ndarray) -> tuple[float, np.ndarray]:
+            X_subset = X[:, np.round(selected).astype(bool)]
+            coef_subset = self._fit_coef_for_subset(X_subset=X_subset, y=y)
+            log_odds = np.matmul(X_subset, coef_subset) + self._intercept
+            dual_vars = -y / (1 + np.exp(y * log_odds))
+            loss = (
+                dual_vars * y * np.log(-dual_vars * y)
+                - (1 + dual_vars * y) * np.log(1 + dual_vars * y)
+            ).sum() - 0.5 * self._gamma * (np.matmul(X_subset.T, dual_vars) ** 2).sum()
+            grad = -0.5 * self._gamma * np.matmul(X.T, dual_vars) ** 2
+            return loss, grad
+
+        return func
+
+    def _fit_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray:
+        estimator = LogisticRegression(C=self._gamma, penalty="l2").fit(X=X_subset, y=y)
+        self._intercept = estimator.intercept_[0]
+        return estimator.coef_[0, :]
diff --git a/src/sparsely/regressor.py b/src/sparsely/regressor.py
index b481f75..eefc595 100644
--- a/src/sparsely/regressor.py
+++ b/src/sparsely/regressor.py
@@ -1,183 +1,74 @@
+"""This module implements a sparse linear model for regression problems."""
+
 from __future__ import annotations
 
-from typing import Optional, Callable
+from typing import Callable
 
 import numpy as np
-from halfspace import Model
-from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.base import RegressorMixin
 from sklearn.preprocessing import StandardScaler
-from sklearn.utils.validation import check_scalar, check_is_fitted
-
-
-class SparseLinearRegressor(BaseEstimator, RegressorMixin):
-    """Sparse linear regressor."""
-
-    def __init__(
-        self,
-        k: Optional[int] = None,
-        gamma: Optional[float] = None,
-        normalize: bool = True,
-        max_iters: int = 500,
-        tol: float = 1e-4,
-        verbose: bool = False,
-    ):
-        """Model constructor.
-
-        Args:
-            k: int or `None`, default=`None`
-                The sparsity parameter (i.e. number of non-zero coefficients). If `None`, then `k` is set to the
-                square root of the number of features, rounded to the nearest integer.
-            gamma: float or `None`, default=`None`
-                The regularization parameter. If `None`, then `gamma` is set to `1 / sqrt(n_samples)`.
-            normalize: bool, default=`True`
-                Whether to normalize the data before fitting the model.
-            max_iters: int, default=`500`
-                The maximum number of iterations.
-            tol: float, default=`1e-4`
-                The tolerance for the stopping criterion.
-            verbose: bool, default=`False`
-                Whether to enable logging of the search progress.
-        """
-        self.k = k
-        self.gamma = gamma
-        self.normalize = normalize
-        self.max_iters = max_iters
-        self.tol = tol
-        self.verbose = verbose
 
-    def fit(self, X: np.ndarray, y: np.ndarray) -> SparseLinearRegressor:
-        """Fit the regressor to the training data.
+from .base import BaseSparseEstimator
 
-        Args:
-            X: array-like of shape (n_samples, n_features)
-                The training data.
-            y: array-like of shape (n_samples,)
-                The training labels.
-        Returns: SparseLinearRegressor
-            The fitted regressor.
-        """
-        # Perform validation checks
-        X, y = self._validate_data(X=X, y=y)
-        self._validate_params()
 
-        # Set hyperparameters to default values if not specified
-        self.k_ = self.k or int(np.sqrt(X.shape[1]))
-        self.gamma_ = self.gamma or 1 / np.sqrt(X.shape[0])
+class SparseLinearRegressor(BaseSparseEstimator, RegressorMixin):
+    """Sparse linear model for regression.
 
-        # Pre-process training data
-        if self.normalize:
-            self.scaler_X_ = StandardScaler()
-            self.scaler_y_ = StandardScaler()
-            X = self.scaler_X_.fit_transform(X)
-            y = self.scaler_y_.fit_transform(y[:, None])[:, 0]
-
-        # Optimize feature selection
-        model = Model(
-            max_gap=self.tol, max_gap_abs=self.tol, log_freq=1 if self.verbose else None
-        )
-        selected = model.add_var_tensor(
-            shape=(X.shape[1],), var_type="B", name="selected"
-        )
-        func, grad = self._make_callbacks(X=X, y=y)
-        model.add_objective_term(var=selected, func=func, grad=grad)
-        model.add_linear_constr(sum(selected) <= self.k_)
-        model.optimize()
-        selected = np.round([model.var_value(var) for var in selected]).astype(bool)
-
-        # Compute coefficients
-        self.coef_ = np.zeros(self.n_features_in_)
-        self.coef_[selected] = self._compute_coef_for_subset(
-            X_subset=X[:, selected], y=y
-        )
+    The model is trained using the L2 loss function and the L2 regularization penalty. The optimal features are
+    selected using a scalable cutting plane algorithm.
+    """
 
-        return self
+    def _pre_process_y(self, y: np.ndarray) -> np.ndarray:
+        """Normalize the target variable."""
+        self._scaler_y = StandardScaler()
+        return self._scaler_y.fit_transform(y[:, None])[:, 0]
 
-    def predict(self, X: np.ndarray) -> np.ndarray:
-        """Predict using the fitted regressor.
+    def _predict(self, X: np.ndarray, proba: bool = False) -> np.ndarray:
+        """Perform inference using the fitted model.
 
         Args:
-            X: array-like of shape (n_samples, n_features)
-                The data to predict.
+            X: The training data. The array should be of shape (n_samples, n_features).
+            proba: Not used. Exists for interoperability with the sparse linear classifier.
 
-        Returns: array-like of shape (n_samples,)
-            The predicted values.
+        Returns:
+            The predicted values. The array will be of shape (n_samples,).
         """
-        check_is_fitted(estimator=self)
-        self._validate_data(X=X)
-        if self.normalize:
-            X = self.scaler_X_.transform(X)
-        predicted = np.dot(X, self.coef_)
-        if self.normalize:
-            predicted = self.scaler_y_.inverse_transform(predicted[:, None])[:, 0]
-        return predicted
+        predicted = np.dot(X, self._coef)
+        return self._scaler_y.inverse_transform(predicted[:, None])[:, 0]
 
-    @property
-    def coef(self) -> np.ndarray:
-        """Get the coefficients of the linear model."""
-        check_is_fitted(estimator=self)
+    def _get_coef(self) -> np.ndarray:
         if self.normalize:
-            return self.coef_ / self.scaler_X_.scale_ * self.scaler_y_.scale_
-        return self.coef_
+            return self._coef / self._scaler_X.scale_ * self._scaler_y.scale_
+        return self._coef * self._scaler_y.scale_  # y is standardized even when X is not
 
-    @property
-    def intercept(self) -> float:
-        """Get the intercept of the linear model."""
-        check_is_fitted(estimator=self)
+    def _get_intercept(self) -> float:
         if self.normalize:
             return (
-                -self.scaler_X_.mean_ / self.scaler_X_.scale_ * self.scaler_y_.scale_
-                + self.scaler_y_.mean_
+                self._scaler_y.mean_[0]
+                - np.dot(self._coef, self._scaler_X.mean_ / self._scaler_X.scale_)
+                * self._scaler_y.scale_[0]
             )
-        return 0
+        return self._scaler_y.mean_[0]  # y is always standardized, so its mean is the offset
 
-    def _validate_params(self):
-        if self.k is not None:
-            check_scalar(
-                x=self.k,
-                name="max_features",
-                target_type=int,
-                min_val=1,
-                max_val=self.n_features_in_,
-                include_boundaries="both",
-            )
-        if self.gamma is not None:
-            check_scalar(
-                x=self.gamma,
-                name="gamma",
-                target_type=float,
-                min_val=0,
-                include_boundaries="neither",
-            )
-        check_scalar(
-            x=self.normalize,
-            name="normalize",
-            target_type=bool,
-        )
-
-    def _make_callbacks(
+    def _make_callback(
         self,
         X: np.ndarray,
         y: np.ndarray,
-    ) -> tuple[Callable[[np.ndarray], float], Callable[[np.ndarray], np.ndarray]]:
-        def func(selected: np.ndarray) -> float:
-            X_subset = X[:, np.round(selected).astype(bool)]
-            coef = self._compute_coef_for_subset(X_subset=X_subset, y=y)
-            return 0.5 * np.dot(y, y - np.matmul(X_subset, coef))
-
-        def grad(selected: np.ndarray) -> np.ndarray:
+    ) -> Callable[[np.ndarray], tuple[float, np.ndarray]]:
+        def func(selected: np.ndarray) -> tuple[float, np.ndarray]:
             X_subset = X[:, np.round(selected).astype(bool)]
-            # TODO: remove redundant computation of subset coef for gradient
-            coef = self._compute_coef_for_subset(X_subset=X_subset, y=y)
-            return (
-                -0.5 * self.gamma_ * np.matmul(X.T, y - np.matmul(X_subset, coef)) ** 2
-            )
+            coef_subset = self._fit_coef_for_subset(X_subset=X_subset, y=y)
+            dual_vars = y - np.matmul(X_subset, coef_subset)
+            loss = 0.5 * np.dot(y, dual_vars)
+            grad = -0.5 * self._gamma * np.matmul(X.T, dual_vars) ** 2
+            return loss, grad
 
-        return func, grad
+        return func
 
-    def _compute_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray:
+    def _fit_coef_for_subset(self, X_subset: np.ndarray, y) -> np.ndarray:
         return np.matmul(
             np.linalg.inv(
-                1 / self.gamma_ * np.eye(X_subset.shape[1])
+                1 / self._gamma * np.eye(X_subset.shape[1])
                 + np.matmul(X_subset.T, X_subset)
             ),
             np.matmul(X_subset.T, y),
diff --git a/src/sparsely/tune.py b/src/sparsely/tune.py
index f3a44b1..5e0137f 100644
--- a/src/sparsely/tune.py
+++ b/src/sparsely/tune.py
@@ -1,19 +1,24 @@
+"""This module implements a function to tune the sparsity parameter of a linear model using cross-validation."""
+
 from copy import deepcopy
 from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import cross_validate
-from tqdm.auto import tqdm
 from sklearn.utils.validation import check_scalar
+from tqdm.auto import tqdm
 
+from .classifier import SparseLinearClassifier
 from .regressor import SparseLinearRegressor
 
+Estimator = Union[SparseLinearRegressor, SparseLinearClassifier]
+
 
 def tune_estimator(
     X: np.ndarray,
     y: np.ndarray,
-    estimator: Optional[SparseLinearRegressor] = None,
+    estimator: Estimator,
     k_min: int = 1,
     k_max: int = None,
     step_size: int = 1,
@@ -21,39 +26,30 @@ def tune_estimator(
     cv: int = 3,
     return_search_log: bool = False,
     show_progress_bar: bool = False,
-) -> Union[SparseLinearRegressor, tuple[SparseLinearRegressor, pd.DataFrame]]:
-    """Tune the sparsity parameter (i.e. number of non-zero coefficients) of the linear regressor.
+) -> Union[Estimator, tuple[Estimator, pd.DataFrame]]:
+    """Tune the sparsity parameter (i.e. number of non-zero coefficients) of a linear model.
 
     The sparsity parameter is tuned by performing a grid search over the range [k_min, k_max] with step size
     `step_size`. If the test score does not improve for `max_iters_no_improvement` iterations, then the search is
     terminated early.
 
     Args:
-        X: np.ndarray of shape (n_samples, n_features)
-            The training data.
-        y: np.ndarray of shape (n_samples,)
-            The training labels.
-        estimator: SparseLinearRegressor or `None`, default=`None`
-            The estimator to tune. If `None`, then a default SparseLinearRegressor estimator is used.
-        k_min: int, default=1
-            The minimum value for the sparsity parameter (i.e. number of non-zero coefficients).
-        k_max: int or `None`, default=`None`
-            The maximum sparsity for the sparsity parameter (i.e. number of non-zero coefficients). If `None`, then
+        X: The training data. The array should be of shape (n_samples, n_features)
+        y: The training labels. The array should be of shape (n_samples,).
+        estimator: The estimator to tune. This must be a `SparseLinearRegressor` instance (for regression problems) or
+            a `SparseLinearClassifier` instance (for classification problems).
+        k_min: The minimum value for the sparsity parameter (i.e. number of non-zero coefficients).
+        k_max: The maximum sparsity for the sparsity parameter (i.e. number of non-zero coefficients). If `None`, then
             this is set to `n_features`.
-        step_size: int, default=1
-            The step size for the search. The sparsity parameter is incremented by this value at each iteration. Must
-            be less than or equal to `k_max - k_min`.
-        max_iters_no_improvement: int or `None`, default=None
-            The maximum number of iterations without improvement in the CV test score before the search is terminated.
-            If `None`, then no early stopping is performed.
-        cv: int, default=3
-            The number of cross-validation folds.
-        return_search_log: bool, default=`False`
-            Whether to return the search log.
-        show_progress_bar: bool, default=`False`
-            Whether to show a progress bar.
+        step_size: The step size for the search. The sparsity parameter is incremented by this value at each iteration.
+            Must be less than or equal to `k_max - k_min`.
+        max_iters_no_improvement: The maximum number of iterations without improvement in the CV test score before the
+            search is terminated. If `None`, then no early stopping is performed.
+        cv: The number of cross-validation folds.
+        return_search_log: Whether to return the search log.
+        show_progress_bar: Whether to show a progress bar.
 
-    Returns: SparseLinearRegressor or tuple of SparseLinearRegressor and pd.DataFrame
+    Returns:
         The tuned estimator. If `return_search_log` is `True`, then a tuple of the tuned estimator and the search log.
     """
     # Perform validation checks
@@ -93,7 +89,6 @@ def tune_estimator(
     check_scalar(x=cv, name="cv", target_type=int, min_val=2, include_boundaries="left")
 
     # Initialize the search
-    estimator = estimator or SparseLinearRegressor()
     best_score = -np.inf
     best_k = None
     n_iters_no_improvement = 0
@@ -109,8 +104,8 @@ def tune_estimator(
             X=X,
             y=y,
             cv=cv,
-            scoring="r2",
-            n_jobs=1,
+            scoring="roc_auc" if isinstance(estimator, SparseLinearClassifier) else "r2",
+            n_jobs=1,  # parallelization will interfere with the MIP solver
         )
 
         # Update search and check early termination condition
diff --git a/tests/conftest.py b/tests/conftest.py
index 5388313..8a01c69 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,19 +1,35 @@
 import numpy as np
 import pytest
-from sklearn.datasets import make_regression
+from sklearn.datasets import make_regression, make_classification
 from sklearn.model_selection import train_test_split
 
 
 @pytest.fixture
-def dataset() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-    """Generate a dataset for testing the regressor."""
+def regression_dataset() -> tuple[
+    np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
+]:
+    """Generate a regression dataset."""
     X, y, coef = make_regression(
-        n_samples=500,
+        n_samples=1000,
         n_features=10,
         n_informative=3,
-        noise=0.0,
         random_state=0,
         coef=True,
     )
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
     return X_train, X_test, y_train, y_test, coef
+
+
+@pytest.fixture
+def classification_dataset() -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Generate a classification dataset."""
+    X, y = make_classification(
+        n_samples=1000,
+        n_features=10,
+        n_informative=3,
+        n_classes=2,
+        n_clusters_per_class=1,
+        random_state=0,
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    return X_train, X_test, y_train, y_test
diff --git a/tests/test_classifier.py b/tests/test_classifier.py
new file mode 100644
index 0000000..1fb1642
--- /dev/null
+++ b/tests/test_classifier.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pytest
+from sklearn.metrics import roc_auc_score, balanced_accuracy_score
+
+from sparsely import SparseLinearClassifier
+
+Dataset = tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
+
+
+# TODO: add tests for sklearn compatibility - requires support for multi-class problems
+
+
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        SparseLinearClassifier(),
+        SparseLinearClassifier(normalize=False),
+        SparseLinearClassifier(k=3),
+        SparseLinearClassifier(gamma=1e-1),
+    ],
+)
+def test_sparse_linear_classifier(
+    classification_dataset: Dataset, estimator: SparseLinearClassifier
+):
+    """Fit on the training split and check output shapes and held-out quality."""
+    X_train, X_test, y_train, y_test = classification_dataset
+    estimator.fit(X_train, y_train)
+    predicted = estimator.predict(X_test)
+    predicted_proba = estimator.predict_proba(X_test)
+    assert estimator._coef.shape == (X_train.shape[1],)
+    assert predicted.shape == (X_test.shape[0],)
+    # Two probability columns are expected, one per class label.
+    assert predicted_proba.shape == (X_test.shape[0], 2)
+    assert balanced_accuracy_score(y_test, predicted) > 0.9
+    # Column 1 is used as the positive-class score for ROC AUC.
+    assert roc_auc_score(y_test, predicted_proba[:, 1]) > 0.9
+
+
+@pytest.mark.parametrize(
+    "estimator",
+    [
+        SparseLinearClassifier(k=0),
+        SparseLinearClassifier(k=11),
+        SparseLinearClassifier(gamma=-1e-2),
+    ],
+)
+def test_sparse_linear_classifier_invalid_params(
+    classification_dataset: Dataset, estimator: SparseLinearClassifier
+):
+    """fit() must raise ValueError for k=0, k=11 or a negative gamma."""
+    X_train, X_test, y_train, y_test = classification_dataset
+    with pytest.raises(ValueError):
+        estimator.fit(X_train, y_train)
diff --git a/tests/test_regressor.py b/tests/test_regressor.py
index 6512a76..0ffbd0d 100644
--- a/tests/test_regressor.py
+++ b/tests/test_regressor.py
@@ -2,7 +2,7 @@
 import pytest
 from sklearn.utils.estimator_checks import check_estimator
 
-from sparsely.regressor import SparseLinearRegressor
+from sparsely import SparseLinearRegressor
 
 Dataset = tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]
 
@@ -20,16 +20,18 @@ def test_sklearn_compatibility():
         SparseLinearRegressor(gamma=1e-2),
     ],
 )
-def test_sparse_linear_regressor(dataset: Dataset, estimator: SparseLinearRegressor):
-    X_train, X_test, y_train, y_test, coef = dataset
+def test_sparse_linear_regressor(
+    regression_dataset: Dataset, estimator: SparseLinearRegressor
+):
+    X_train, X_test, y_train, y_test, coef = regression_dataset
     predicted = estimator.fit(X_train, y_train).predict(X_test)
-    assert estimator.coef_.shape == (X_train.shape[1],)
+    assert estimator._coef.shape == (X_train.shape[1],)
     assert predicted.shape == (X_test.shape[0],)
-    assert estimator.score(X_train, y_train) > 0.8
-    assert estimator.score(X_test, y_test) > 0.8
-    assert estimator.coef_.shape == (X_train.shape[1],)
-    assert (~np.isclose(coef, 0)).sum() <= estimator.k_
-    assert (np.isclose(estimator.coef_, 0) == np.isclose(coef, 0)).all()
+    assert estimator.score(X_train, y_train) > 0.95
+    assert estimator.score(X_test, y_test) > 0.95
+    # Support recovery: the true non-zeros fit within k and the zero patterns agree.
+    assert (~np.isclose(coef, 0)).sum() <= estimator._k
+    assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()
 
 
 @pytest.mark.parametrize(
@@ -41,8 +43,8 @@ def test_sparse_linear_regressor(dataset: Dataset, estimator: SparseLinearRegres
     ],
 )
 def test_sparse_linear_regressor_invalid_params(
-    dataset: Dataset, estimator: SparseLinearRegressor
+    regression_dataset: Dataset, estimator: SparseLinearRegressor
 ):
-    X_train, X_test, y_train, y_test, coef = dataset
+    X_train, X_test, y_train, y_test, coef = regression_dataset
     with pytest.raises(ValueError):
         estimator.fit(X_train, y_train)
diff --git a/tests/test_tune.py b/tests/test_tune.py
index 24677f3..91f1483 100644
--- a/tests/test_tune.py
+++ b/tests/test_tune.py
@@ -1,20 +1,24 @@
-from sparsely import tune_estimator
 import numpy as np
 import pandas as pd
 import pytest
 
+from sparsely import tune_estimator, SparseLinearRegressor
+
 
 @pytest.mark.parametrize("max_iters_no_improvement", [None, 1])
 @pytest.mark.parametrize("return_search_log", [True, False])
 def test_tune_estimator(
-    dataset: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
+    regression_dataset: tuple[
+        np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
+    ],
     max_iters_no_improvement: int,
     return_search_log: bool,
 ):
-    X_train, X_test, y_train, y_test, coef = dataset
+    X_train, X_test, y_train, y_test, coef = regression_dataset
     output = tune_estimator(
         X_train,
         y_train,
+        estimator=SparseLinearRegressor(),
         k_min=1,
         k_max=5,
         max_iters_no_improvement=max_iters_no_improvement,
@@ -32,6 +36,6 @@ def test_tune_estimator(
         estimator = output
     assert estimator.score(X_train, y_train) > 0.8
     assert estimator.score(X_test, y_test) > 0.8
-    assert estimator.coef_.shape == (X_train.shape[1],)
-    assert (~np.isclose(coef, 0)).sum() <= estimator.k_
-    assert (np.isclose(estimator.coef_, 0) == np.isclose(coef, 0)).all()
+    assert estimator._coef.shape == (X_train.shape[1],)
+    assert (~np.isclose(coef, 0)).sum() <= estimator._k
+    assert (np.isclose(estimator._coef, 0) == np.isclose(coef, 0)).all()