Remove the dask_ml dev dependency (#641)
* Copy transformers from dask_ml.preprocessing

Only keep the non-dask specific parts

* Replace dask_ml transformers with included ones

* Also add ColumnTransformer from dask

* Remove final dask_ml imports

* Update docstring and remove dask_ml references

* Add license for dask_ml as per the conditions

* Remove dask_ml from environment.yml

* Add changelog entry

* Remove release date [skip ci]

* Remove link in docs that leads nowhere

* Get rid of unnecessary parts of some transformers

* Get rid of DummyEncoder

* Get rid of separate categorical_util module

* Where we are going, we don't need long docstrings

* Delete even more unnecessary checks

* Separate tutorial dependencies into their own file

* Pin minimum pandas version

Remove some code that is no longer needed
MartinStancsicsQC authored May 30, 2023
1 parent c9eb22d commit 990ce1e
Showing 6 changed files with 131 additions and 18 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
@@ -14,6 +14,10 @@ Changelog

- Fix the ``glm_benchmarks_analyze`` command line tool. See `here <https://github.com/Quantco/glum/issues/642>`_.

**Other changes:**

- Remove dev dependency on ``dask_ml``.


2.5.1 - 2023-05-19
------------------
8 changes: 7 additions & 1 deletion docs/contributing.rst
@@ -48,6 +48,9 @@ We strongly suggest using ``mamba`` instead of ``conda`` as this provides the s
# If you want to install the dependencies necessary for benchmarking against other GLM packages:
mamba env update -n glum --file environment-benchmark.yml

# If you want to work on the tutorial notebooks:
mamba env update -n glum --file environment-tutorial.yml

# Activate the previously created conda environment
conda activate glum

@@ -118,6 +121,9 @@ Alternatively, if you install `entr <http://eradman.com/entrproject/>`_, then yo
cd docs
./dev

.. note::
The tutorial notebooks are not executed as part of the documentation build. If you want to modify them, make sure to execute them manually and save the output before committing. Also don't forget to install the extra dependencies for the tutorial notebooks as described above.
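
For example, a minimal sketch (the notebook path is illustrative; it assumes ``jupyter nbconvert`` is available in the environment) that re-executes a notebook in place so its output is stored before committing:

    import subprocess

    # Re-run a tutorial notebook and write the executed result back in place,
    # so the stored output matches the committed code.
    subprocess.run(
        [
            "jupyter", "nbconvert", "--to", "notebook",
            "--execute", "--inplace",
            "docs/tutorials/rossman/rossman.ipynb",
        ],
        check=True,
    )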

If you are a newbie to Sphinx, the links below may help get you up to speed on some of the trickier aspects:

* `An idiot's guide to Sphinx <https://samnicholls.net/2016/06/15/how-to-sphinx-readthedocs/>`_
@@ -136,7 +142,7 @@ What follows is a high-level summary of the source code structure. For more deta

* ``_glm.py`` - This is the main entrypoint and implements the core logic of the GLM. Most of the code in this file handles input arguments and prepares the data for the GLM fitting algorithm.
* ``_glm_cv.py`` - This is the entrypoint for the cross-validated GLM implementation. It depends on a lot of the code in ``_glm.py`` and only modifies the sections necessary for training many models with different regularization parameters.
* ``_solvers.py`` - This contains the bulk of the IRLS and L-BFGS algorithms for training GLMs. For details on the algorithm, see :doc:`background/background` for more details.
* ``_solvers.py`` - This contains the bulk of the IRLS and L-BFGS algorithms for training GLMs.
* ``_cd_fast.pyx`` - This is a Cython implementation of the coordinate descent algorithm used for fitting L1 penalty GLMs. Note the ``.pyx`` extension indicating that it is a Cython source file.
* ``_distribution.py`` - definitions of the distributions that can be used. Includes Normal, Poisson, Gamma, InverseGaussian, Tweedie, Binomial and GeneralizedHyperbolicSecant distributions.
* ``_link.py`` - definitions of the link functions that can be used. Includes identity, log, logit and Tweedie link functions.
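
As a quick orientation, here is an illustrative sketch (assuming glum's public entry points) of how these pieces fit together: the estimator in ``_glm.py`` is exposed as ``GeneralizedLinearRegressor``, and the distributions in ``_distribution.py`` are selected via its ``family`` argument.

    import numpy as np
    from glum import GeneralizedLinearRegressor

    rng = np.random.default_rng(0)
    X = rng.uniform(size=(100, 3))
    y = rng.poisson(np.exp(X @ np.array([0.1, 0.2, 0.3])))

    # A Poisson GLM with a small L1 penalty; the solvers in _solvers.py and
    # _cd_fast.pyx do the actual fitting.
    model = GeneralizedLinearRegressor(family="poisson", alpha=0.01, l1_ratio=1.0)
    model.fit(X, y)
    print(model.coef_)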
10 changes: 10 additions & 0 deletions environment-tutorial.yml
@@ -0,0 +1,10 @@
name: glum
channels:
- conda-forge
- nodefaults
dependencies:
- altair # used in docs/tutorials/rossman
- openml # used for downloading datasets in the tutorials.
- dask-ml>=2022.5.27 # used in tutorials rossman and insurance
- shapely # used in docs/tutorials/regularization_housing_data
- libpysal # used in docs/tutorials/regularization_housing_data
12 changes: 5 additions & 7 deletions environment.yml
@@ -7,15 +7,14 @@ dependencies:
# for the conda packages. please put those in `conda.recipe/meta.yaml`!!
- libblas>=0=*mkl # comment this line out for macOS arm64
- numexpr
- pandas
- pandas>=0.21
- tabmat>=3.1.0
- scikit-learn>=0.23
- scipy
- tqdm

# development tools
- black
- dask-ml>=2022.5.27 # used for data creation for the benchmark problems
- flake8
- git_root
- ipdb
@@ -34,20 +33,19 @@ dependencies:
- cxx-compiler
- cython

# required for tests
- statsmodels

# documentation dev
- altair # used in docs/tutorials/rossman
- jinja2
- jupyterlab
- jupytext
- libpysal # used in docs/tutorials/regularization_housing_data
- make
- matplotlib-base
- nbclassic>=0.2.8
- nbsphinx>=0.8.3
- openml # used for downloading datasets in the tutorials.
- shapely # used in docs/tutorials/regularization_housing_data
- sphinx>=3.5.3
- sphinx_rtd_theme
- sphinxcontrib-apidoc
- sphinxext-altair
- statsmodels

112 changes: 104 additions & 8 deletions src/glum_benchmarks/data/create_insurance.py
@@ -1,13 +1,16 @@
import os
from typing import Any, Callable, List, Optional, Tuple
from typing import Any, Callable, Iterable, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from dask_ml.compose import make_column_transformer
from dask_ml.preprocessing import Categorizer, OrdinalEncoder
import sklearn.compose
from git_root import git_root
from pandas.api.types import is_categorical_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose._column_transformer import _get_transformer_list
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.utils.validation import check_is_fitted

from ..util import exposure_and_offset_to_weights

@@ -94,14 +97,107 @@ def create_insurance_raw_data(verbose=False) -> None:
df.to_parquet(out_path)


class Categorizer(BaseEstimator, TransformerMixin):
"""Transform columns of a DataFrame to categorical dtype."""

def fit(
self, X: pd.DataFrame, y: Optional[Union[np.ndarray, pd.Series]] = None
) -> "Categorizer":
"""Find the categorical columns."""
columns = X.columns
categories = {}
for name in columns:
col = X[name]
if not is_categorical_dtype(col):
col = pd.Series(col, index=X.index).astype("category")
categories[name] = col.dtype

self.columns_ = columns
self.categories_ = categories
return self

def transform(
self, X: pd.DataFrame, y: Optional[Union[np.ndarray, pd.Series]] = None
) -> pd.DataFrame:
"""Transform the columns in ``X`` according to ``self.categories_``."""
check_is_fitted(self, "categories_")
categories = self.categories_

for k, dtype in categories.items():
if not isinstance(dtype, pd.api.types.CategoricalDtype):
dtype = pd.api.types.CategoricalDtype(*dtype)
X[k] = X[k].astype(dtype)

return X
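
# Usage sketch (illustrative, not part of the diff): on fit, Categorizer
# records a categorical dtype for every column; on transform, it reapplies
# the recorded dtypes.
#
#   df = pd.DataFrame({"region": ["a", "b", "a"], "size": [1, 2, 1]})
#   cat = Categorizer().fit(df)
#   cat.transform(df).dtypes  # every column is now a category dtype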


def get_categorizer(col_name: str, name="cat") -> Tuple[str, Categorizer]:
    """
    Get a dask_ml Categorizer.

    Categorizer only operates on object columns unless you explicitly pass the column
    name.
    """
    return name, Categorizer(columns=[col_name])
    """Get a Categorizer."""
    return name, Categorizer()


class OrdinalEncoder(BaseEstimator, TransformerMixin):
"""Ordinal (integer) encode categorical columns."""

def fit(
self, X: pd.DataFrame, y: Optional[Union[np.ndarray, pd.Series]] = None
) -> "OrdinalEncoder":
"""Determine the categorical columns to be encoded."""
self.categorical_columns_ = X.select_dtypes(include=["category"]).columns
return self

def transform(
self, X: pd.DataFrame, y: Optional[Union[np.ndarray, pd.Series]] = None
) -> pd.DataFrame:
"""Ordinal encode the categorical columns in X."""
check_is_fitted(self, "categorical_columns_")
X = X.copy()
for col in self.categorical_columns_:
X[col] = X[col].cat.codes
return X
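
# Usage sketch (illustrative, not part of the diff): categorical columns are
# replaced by their integer category codes; other columns pass through.
#
#   df = pd.DataFrame({"region": pd.Categorical(["a", "b", "a"])})
#   OrdinalEncoder().fit_transform(df)["region"].tolist()  # [0, 1, 0]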


class ColumnTransformer(sklearn.compose.ColumnTransformer):
    """Applies transformers to columns of a pandas DataFrame.

    Returns a `pandas.DataFrame`, but otherwise behaves like
    `sklearn.compose.ColumnTransformer`. See the
    `sklearn.compose.ColumnTransformer` documentation for more information.
    """

def __init__(
self,
transformers,
remainder="drop",
sparse_threshold=0.3,
n_jobs=1,
transformer_weights=None,
):
super().__init__(
transformers=transformers,
remainder=remainder,
sparse_threshold=sparse_threshold,
n_jobs=n_jobs,
transformer_weights=transformer_weights,
)

def _hstack(self, Xs: Iterable[Union[pd.Series, pd.DataFrame]]):
"""Stacks X horizontally."""
return pd.concat(Xs, axis="columns")


def make_column_transformer(*transformers, remainder: str = "drop"): # noqa: D103
# This is identical to scikit-learn's. We're just using our
# ColumnTransformer instead.
transformer_list = _get_transformer_list(transformers)
return ColumnTransformer(
transformer_list,
remainder=remainder,
)


make_column_transformer.__doc__ = getattr( # noqa: B009
sklearn.compose.make_column_transformer, "__doc__"
)
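
# Usage sketch (illustrative, not part of the diff): because _hstack uses
# pd.concat, the fitted transformer returns a pandas.DataFrame rather than
# a numpy array or sparse matrix.
#
#   ct = make_column_transformer(
#       (Categorizer(), ["region"]),
#       remainder="passthrough",
#   )
#   transformed = ct.fit_transform(df)  # a pandas.DataFrame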


def func_returns_df(
3 changes: 1 addition & 2 deletions src/glum_benchmarks/problems.py
@@ -6,7 +6,6 @@
import numpy as np
import pandas as pd
import tabmat as tm
from dask_ml.preprocessing import DummyEncoder
from git_root import git_root
from joblib import Memory
from scipy.sparse import csc_matrix
@@ -71,7 +70,7 @@ def transform_col(i: int, dtype) -> Union[pd.DataFrame, tm.CategoricalMatrix]:
if dtype.name == "category":
if storage == "cat":
return tm.CategoricalMatrix(X_in.iloc[:, i])
return DummyEncoder().fit_transform(X_in.iloc[:, [i]])
return pd.get_dummies(X_in.iloc[:, i], drop_first=False)
return X_in.iloc[:, [i]]

mat_parts = [transform_col(i, dtype) for i, dtype in enumerate(X_in.dtypes)]
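
# Sketch of the swap above (illustrative, not part of the diff):
# pandas.get_dummies produces the same one-hot columns that dask_ml's
# DummyEncoder used to create.
#
#   s = pd.Series(["a", "b", "a"], dtype="category")
#   pd.get_dummies(s, drop_first=False)
#   #    a  b
#   # 0  1  0
#   # 1  0  1
#   # 2  1  0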
