From a9fae90ebaa05b1ba1b473ed39625d6ee4371a77 Mon Sep 17 00:00:00 2001
From: Ramon Bussing <45173421+Ram0nB@users.noreply.github.com>
Date: Fri, 20 Oct 2023 11:58:04 +0200
Subject: [PATCH] [ENH] multiple quantile regression (#108)

Fixes #107

For quantile regression, often more than one quantile probability is of
interest. However, existing Sklearn compatible quantile regressors
always fit and predict a single quantile probability. To the best of my
knowdlegde, there is no standardized way to integrate multiple quantile
regression with Sktime/Skpro probabilistic prediction methods such as
predict_quantiles/predict_intervals. This PR adds new Skpro regressor
that wraps multiple quantile regressors and supports probabilistic
predictions from wrapped regressors.
---
 .all-contributorsrc                      |  11 +
 docs/source/api_reference/regression.rst |   8 +
 skpro/regression/multiquantile.py        | 380 +++++++++++++++++++++++
 3 files changed, 399 insertions(+)
 create mode 100644 skpro/regression/multiquantile.py

diff --git a/.all-contributorsrc b/.all-contributorsrc
index e1b2c539f..dc18319f9 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -88,6 +88,17 @@
         "ideas",
         "review"
       ]
+    },
+    {
+      "login": "Ram0nB",
+      "name": "Ramon Bussing",
+      "avatar_url": "https://avatars.githubusercontent.com/u/45173421",
+      "profile": "https://github.com/Ram0nB",
+      "contributions": [
+        "code",
+        "doc",
+        "ideas"
+      ]
     }
   ]
 }
diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst
index 197998d09..ae769ac75 100644
--- a/docs/source/api_reference/regression.rst
+++ b/docs/source/api_reference/regression.rst
@@ -63,6 +63,14 @@ take one or multiple ``sklearn`` estimators and adda probabilistic prediction mo
 
     ResidualDouble
 
+.. currentmodule:: skpro.regression.multiquantile
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    MultipleQuantileRegressor
+
 Base
 ----
 
diff --git a/skpro/regression/multiquantile.py b/skpro/regression/multiquantile.py
new file mode 100644
index 000000000..045684314
--- /dev/null
+++ b/skpro/regression/multiquantile.py
@@ -0,0 +1,380 @@
+"""Multiple quantile regression."""
+
+__all__ = ["MultipleQuantileRegressor"]
+__author__ = ["Ram0nB"]
+
+import numpy as np
+import pandas as pd
+from joblib import Parallel, delayed
+from sklearn.base import clone
+
+from skpro.regression.base import BaseProbaRegressor
+
+
+class MultipleQuantileRegressor(BaseProbaRegressor):
+    r"""Multiple quantile regressor.
+
+    A tabular quantile regressor typically regresses a single probability level,
+    however, often more than one quantile probability is of interest. Therefore, this
+    multiple quantile regressor wraps multiple quantile regressors for multiple quantile
+    probabilities so that each probability has one corresponding regressor. After
+    fitting, all quantile regressors can be used to make probabilistic predictions.
+
+    In `fit`, for every probability in alpha, the quantile_regressor is cloned and the
+    probability is set. Subsequently all regressors are fitted.
+
+    In probabilistic predict-like methods, if predictions of a quantile are requested,
+    the fitted quantile regressor that is nearest to the desired quantile probability
+    is used for the prediction, of the requested quantile.
+
+    For instance, let :math:`\alpha = [\alpha_1, \alpha_2, \ldots, \alpha_n]`
+    be the `alpha` provided to `__init__`, and
+    let :math:`\alpha' = [\alpha'_1, \alpha'_2, \ldots, \alpha'_m]` be the quantiles
+    requested in `predict_quantiles`.
+    Then, we use the fitted quantile regressor at quantile :math:`\hat{\alpha}_j`,
+    :math:`\hat{\alpha}_j := \underset{i = 1 \dots n}{\mathrm{argmin}}\ | \alpha'_j
+    - \alpha_i |` to make the quantile prediction for the requested quantile
+    probability :math:`\alpha'_j`.
+
+    Consistently, the `predict_proba` method returns an empirical
+    distribution with supports at quantile points corresponding to `alpha`,
+    and weights corresponding to the nearest quantile regressor.
+
+    Parameters
+    ----------
+    quantile_regressor : Sklearn compatible quantile regressor
+        Tabular quantile regressor for probabilistic prediction methods.
+    alpha_name : str
+        Alpha parameter name that sets the quantile probability level for the
+        quantile_regressor.
+    alpha : list with float
+        A list of probabilities in the open interval (0, 1).
+        For each probability, a quantile_regressor will be fit.
+    mean_regressor : Sklearn compatible regressor or None, default=None
+        Tabular mean regressor for `predict`. Can't predict mean if not provided.
+    n_jobs : int or None, default=None
+        The number of jobs to run in parallel for `fit` and all probabilistic prediction
+        methods. -1 means using all processors, -2 means using all except one processors
+        and None means no parallelization.
+    sort_quantiles : bool, default=False
+        For the quantile estimates, the lack of monotinicity can be a problem caused by
+        the crossing of quantiles, also known as the quantile crossing problem. This can
+        happen because each quantile is predicted by a single independent instance of
+        the quantile_regressor. To resolve this issue, the quantiles for all predictions
+        per row index will be sorted if sort_quantiles=True. This methodology will never
+        lead to a higher quantile loss and solves the quantile crossing problem [1]_.
+
+    Attributes
+    ----------
+    regressors_ : dict
+        All fitted regressors.
+        The keys are the probabilities in alpha and "mean" for the mean.
+
+    References
+    ----------
+    .. [1] Victor Chernozhukov, Iván Fernández-Val, and Alfred Galichon. Quantile and
+       probability curves without crossing. Econometrica, 78(3):1093-1125, 2010.
+
+    Examples
+    --------
+    >>> from sklearn.linear_model import QuantileRegressor, LinearRegression
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.datasets import load_diabetes
+    >>> from skpro.regression.multiquantile import MultipleQuantileRegressor
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
+    >>> alpha = [0.1, 0.3, 0.5, 0.7, 0.9]
+    >>> mqr = MultipleQuantileRegressor(
+    ...     mean_regressor=LinearRegression(),
+    ...     quantile_regressor=QuantileRegressor(solver="highs"),
+    ...     alpha_name="quantile",
+    ...     alpha=alpha,
+    ...     n_jobs=-1,
+    ... )
+    >>> mqr = mqr.fit(X_train, y_train)
+    >>> predicted_quantiles = mqr.predict_quantiles(X_test, alpha)
+    """
+
+    _tags = {
+        "capability:missing": False,
+        "capability:multioutput": False,
+    }
+
+    def __init__(
+        self,
+        quantile_regressor,
+        alpha_name,
+        alpha,
+        mean_regressor=None,
+        n_jobs=None,
+        sort_quantiles=False,
+    ):
+        self.quantile_regressor = quantile_regressor
+        self.alpha_name = alpha_name
+        self.alpha = alpha
+        self.mean_regressor = mean_regressor
+        self.n_jobs = n_jobs
+        self.sort_quantiles = sort_quantiles
+
+        # input checks
+        self._no_mean_reg = True if mean_regressor is None else False
+        if len(alpha) == 0:
+            raise ValueError("at least one value in alpha is required.")
+        elif np.amin(alpha) <= 0 or np.amax(alpha) >= 1:
+            raise ValueError(
+                "values in alpha must lie in the open interval (0, 1), "
+                f"but found alpha: {alpha}."
+            )
+
+        super().__init__()
+
+    def _fit(self, X, y):
+        """Fit regressor to training data.
+
+        Writes to self:
+            Sets fitted model attributes ending in "_".
+
+        Parameters
+        ----------
+        X : pandas DataFrame
+            feature instances to fit regressor to
+        y : pandas DataFrame, Series, must be same length as X
+            labels to fit regressor to
+
+        Returns
+        -------
+        self : reference to self
+        """
+        # clone, set alpha and list all regressors
+        regressors = [clone(self.mean_regressor)] if not self._no_mean_reg else []
+        for a in self.alpha:
+            q_est = clone(self.quantile_regressor)
+            q_est = q_est.set_params(**{self.alpha_name: a})
+            q_est_p = q_est.get_params()
+            if q_est_p[self.alpha_name] != a:
+                raise ValueError("Can't set alpha for quantile regressor.")
+            else:
+                regressors.append(q_est)
+
+        # fit regressors
+        def _fit_regressor(X, y, regressor):
+            return regressor.fit(X=X, y=y)
+
+        X_fit = X.values
+        y_fit = y.values.flatten()
+        fitted_regressors = Parallel(n_jobs=self.n_jobs)(
+            delayed(_fit_regressor)(X_fit, y_fit, regressor) for regressor in regressors
+        )
+
+        # put fitted regressors in dict and write to self
+        regressors_ = {}
+        if not self._no_mean_reg:
+            regressors_["mean"] = fitted_regressors.pop(0)
+        for i, a in enumerate(self.alpha):
+            regressors_[a] = fitted_regressors[i]
+        self.regressors_ = regressors_
+
+        # write varname to self ("capability:multioutput": False -> 1 column)
+        self._y_varname = y.columns[0]
+
+        return self
+
+    def _predict(self, X):
+        """Predict labels for data from features.
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+
+        Parameters
+        ----------
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
+
+        Returns
+        -------
+        y : pandas DataFrame, same length as `X`
+            labels predicted for `X`
+        """
+        if self._no_mean_reg:
+            raise ValueError("can't predict: no mean_regressor provided.")
+
+        # predict
+        X_pred = X.values
+        preds = self.regressors_["mean"].predict(X_pred)
+
+        # format predictions as DataFrame
+        preds = np.array(preds).flatten()
+        preds = pd.DataFrame(preds, columns=[self._y_varname])
+        preds.index = X.index
+
+        return preds
+
+    def _predict_quantiles(self, X, alpha):
+        """Compute/return quantile predictions.
+
+        private _predict_quantiles containing the core logic,
+            called from predict_quantiles and default _predict_interval
+
+        Parameters
+        ----------
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
+        alpha : guaranteed list of float
+            A list of probabilities at which quantile predictions are computed.
+
+        Returns
+        -------
+        quantiles : pd.DataFrame
+            Column has multi-index: first level is variable name from ``y`` in fit,
+                second level being the values of alpha passed to the function.
+            Row index is equal to row index of ``X``.
+            Entries are quantile predictions, for var in col index,
+                at quantile probability in second col index, for the row index.
+        """
+        # per a in alpha, list fitted regressor that is nearest to a
+        regressors = []
+        for a in sorted(alpha):
+            nearest_fitted_a = min(self.alpha, key=lambda p: abs(p - a))
+            regressors.append(self.regressors_[nearest_fitted_a])
+
+        # predict
+        def _predict_quantile(X, regressor):
+            pred = regressor.predict(X=X)
+            pred = np.array(pred).flatten()
+            return pred
+
+        X_pred = X.values
+        preds = Parallel(n_jobs=self.n_jobs)(
+            delayed(_predict_quantile)(X_pred, regressor) for regressor in regressors
+        )
+
+        # format predictions as DataFrame
+        y_cols = [self._y_varname]
+        columns_sorted = pd.MultiIndex.from_product([y_cols, sorted(alpha)])
+        columns = pd.MultiIndex.from_product([y_cols, alpha])
+        index = X.index
+        preds = pd.DataFrame(np.transpose(preds), columns=columns_sorted, index=index)
+
+        # solve quantile crossing problem
+        if self.sort_quantiles:
+            preds = preds.apply(
+                lambda row: row.sort_values().values, axis=1, result_type="broadcast"
+            )
+
+        # sort columns in the order of alpha requested
+        preds = preds.reindex(columns=columns)
+
+        return preds
+
+    def _predict_proba(self, X):
+        """Predict distribution over labels for data from features.
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+
+        Parameters
+        ----------
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
+
+        Returns
+        -------
+        y : skpro BaseDistribution, same length as `X`
+            labels predicted for `X`
+        """
+        from skpro.distributions import Empirical
+
+        alpha_sorted = sorted(self.alpha)
+
+        # get quantile prediction for all fitted quantile regressors
+        quantile_preds = self._predict_quantiles(X, alpha_sorted)
+
+        # format as emprical sample for empirical distr
+        # row multiindex: (alpha, X.index)
+        # column index  : as y in fit
+        empirical_spl = quantile_preds.stack(level=1).swaplevel(0, 1)
+
+        # obtain alpha weights for empirical distr such that we take the nearest
+        # available quantile prob (the cum weights match the chosen quantile prob)
+        alpha_np = np.array(alpha_sorted)
+        alpha_diff = np.diff(alpha_np)
+        alpha_diff2 = np.repeat(alpha_diff, 2) / 2
+        weight_double = np.concatenate([[alpha_np[0]], alpha_diff2, [1 - alpha_np[-1]]])
+        weight_double2 = weight_double.reshape(-1, 2)
+        weights = weight_double2.sum(axis=1)
+
+        # obtain weights per emprical sample
+        empirical_spl_weights = pd.Series(index=empirical_spl.index)
+        for i, a in enumerate(alpha_sorted):
+            empirical_spl_weights.loc[a] = weights[i]
+
+        y_pred_proba = Empirical(
+            empirical_spl,
+            weights=empirical_spl_weights,
+            index=X.index,
+            columns=[self._y_varname],
+        )
+
+        return y_pred_proba
+
+    @classmethod
+    def get_test_params(cls):
+        """
+        Return testing parameter settings for the regressor.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class
+            Each dict are parameters to construct an "interesting" test instance, i.e.,
+            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        from sklearn.ensemble import GradientBoostingRegressor
+        from sklearn.linear_model import LinearRegression, QuantileRegressor
+
+        from skpro.regression.tests.test_all_regressors import TEST_ALPHAS
+
+        # extract all alpha values from TEST_ALPHAS
+        alpha = []
+        for a in TEST_ALPHAS:
+            if isinstance(a, list):
+                for a2 in a:
+                    alpha.append(a2)
+            else:
+                alpha.append(a)
+
+        # TEST_ALPHAS also used for predict_interval coverage, so we also need the
+        # corresponding coverage quantile probabilities
+        coverage_alpha = []
+        for a in alpha:
+            coverage_alpha.extend([0.5 - 0.5 * float(a), 0.5 + 0.5 * float(a)])
+        alpha += coverage_alpha
+        alpha = sorted(list(set(alpha)))
+
+        params = [
+            {
+                "mean_regressor": LinearRegression(),
+                "quantile_regressor": QuantileRegressor(solver="highs"),
+                "alpha_name": "quantile",
+                "alpha": alpha,
+                "n_jobs": -1,
+                "sort_quantiles": True,
+            },
+            {
+                "mean_regressor": GradientBoostingRegressor(),
+                "quantile_regressor": GradientBoostingRegressor(loss="quantile"),
+                "alpha_name": "alpha",
+                "alpha": alpha,
+                "n_jobs": -1,
+                "sort_quantiles": True,
+            },
+        ]
+
+        return params