meta module

koaning · Dec 15, 2024 · e1b2520
1 parent bac7e83
commit e1b2520
Show file tree

Hide file tree

Showing 19 changed files with 129 additions and 62 deletions.
diff --git a/sklego/meta/confusion_balancer.py b/sklego/meta/confusion_balancer.py
@@ -2,7 +2,8 @@
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
 from sklearn.metrics import confusion_matrix
 from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 from sklego.base import ProbabilisticClassifier
 
@@ -63,7 +64,9 @@ def fit(self, X, y):
             If the underlying estimator does not have a `predict_proba` method.
         """
 
-        X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
+        _check_n_features(self, X, reset=True)
+
         if not isinstance(self.estimator, ProbabilisticClassifier):
             raise ValueError(
                 "The ConfusionBalancer meta model only works on classification models with .predict_proba."
@@ -72,7 +75,6 @@ def fit(self, X, y):
         self.classes_ = unique_labels(y)
         cfm = confusion_matrix(y, self.estimator_.predict(X)).T + self.cfm_smooth
         self.cfm_ = cfm / cfm.sum(axis=1).reshape(-1, 1)
-        self.n_features_in_ = X.shape[1]
         return self
 
     def predict_proba(self, X):
@@ -90,7 +92,8 @@ def predict_proba(self, X):
             The predicted values.
         """
         check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
-        X = check_array(X, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
         preds = self.estimator_.predict_proba(X)
         return (1 - self.alpha) * preds + self.alpha * preds @ self.cfm_
 
@@ -108,5 +111,6 @@ def predict(self, X):
             The predicted values.
         """
         check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
-        X = check_array(X, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
         return self.classes_[self.predict_proba(X).argmax(axis=1)]
diff --git a/sklego/meta/decay_estimator.py b/sklego/meta/decay_estimator.py
@@ -1,11 +1,12 @@
 from sklearn import clone
 from sklearn.base import BaseEstimator, MetaEstimatorMixin
-from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_X_y
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay
 
 
-class DecayEstimator(BaseEstimator, MetaEstimatorMixin):
+class DecayEstimator(MetaEstimatorMixin, BaseEstimator):
     """Morphs an estimator such that the training weights can be adapted to ensure that points that are far away have
     less weight.
 
@@ -123,7 +124,8 @@ def fit(self, X, y):
         """
 
         if self.check_input:
-            X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, ensure_min_features=0)
+            X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, reset=True)
+        _check_n_features(self, X, reset=True)
 
         if self.decay_func in self._ALLOWED_DECAYS.keys():
             self.decay_func_ = self._ALLOWED_DECAYS[self.decay_func]
@@ -144,7 +146,6 @@ def fit(self, X, y):
         if self._is_classifier():
             self.classes_ = self.estimator_.classes_
 
-        self.n_features_in_ = X.shape[1]
         return self
 
     def predict(self, X):
@@ -169,3 +170,6 @@ def predict(self, X):
     def score(self, X, y):
         """Alias for `.score()` method of the underlying estimator."""
         return self.estimator_.score(X, y)
+
+    def __sklearn_tags__(self):
+        return self.model.__sklearn_tags__()
diff --git a/sklego/meta/estimator_transformer.py b/sklego/meta/estimator_transformer.py
@@ -1,6 +1,7 @@
 from sklearn import clone
 from sklearn.base import BaseEstimator, MetaEstimatorMixin, TransformerMixin
-from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_X_y
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class EstimatorTransformer(TransformerMixin, MetaEstimatorMixin, BaseEstimator):
@@ -52,7 +53,9 @@ def fit(self, X, y, **kwargs):
         """
 
         if self.check_input:
-            X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
+            X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, multi_output=True, reset=True)
+
+        _check_n_features(self, X, reset=True)
 
         self.multi_output_ = len(y.shape) > 1
         self.estimator_ = clone(self.estimator)
@@ -76,5 +79,9 @@ def transform(self, X):
         """
 
         check_is_fitted(self, "estimator_")
+        if self.check_input:
+            X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         output = getattr(self.estimator_, self.predict_func)(X)
         return output if self.multi_output_ else output.reshape(-1, 1)
diff --git a/sklego/meta/grouped_predictor.py b/sklego/meta/grouped_predictor.py
@@ -401,6 +401,11 @@ def _estimator_type(self):
     def _more_tags(self):
         return {"allow_nan": True}
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        return tags
+
 
 class GroupedRegressor(RegressorMixin, GroupedPredictor):
     """`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data.

diff --git a/sklego/meta/grouped_transformer.py b/sklego/meta/grouped_transformer.py
@@ -213,6 +213,11 @@ def transform(self, X):
     def _more_tags(self):
         return {"allow_nan": True}
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        return tags
+
     def get_feature_names_out(self) -> List[str]:
         "Alias for the `feature_names_out_` attribute defined during fit."
         return self.feature_names_out_
diff --git a/sklego/meta/hierarchical_predictor.py b/sklego/meta/hierarchical_predictor.py
@@ -179,7 +179,7 @@ class HierarchicalPredictor(ShrinkageMixin, MetaEstimatorMixin, BaseEstimator):
         Number of features in the training data.
     n_features_ : int
         Number of features used by the estimators.
-    n_levels_ : int
+    n_fitted_levels_ : int
         Number of hierarchical levels in the grouping.
     """
 
@@ -341,8 +341,8 @@ def _predict_estimators(self, X, method_name):
             else:  # binary case with `method_name = "decision_function"`
                 n_out = 1
 
-        preds = np.zeros((X.shape[0], self.n_levels_, n_out), dtype=float)
-        shrinkage = np.zeros((X.shape[0], self.n_levels_), dtype=float)
+        preds = np.zeros((X.shape[0], self.n_fitted_levels_, n_out), dtype=float)
+        shrinkage = np.zeros((X.shape[0], self.n_fitted_levels_), dtype=float)
 
         for level_idx, grp_names in enumerate(self.fitted_levels_):
             for grp_values, grp_frame in frame.group_by(grp_names):
@@ -363,7 +363,10 @@ def _predict_estimators(self, X, method_name):
 
                 preds[np.ix_(grp_idx, [level_idx], last_dim_ix)] = np.atleast_3d(raw_pred[:, None])
                 shrinkage[np.ix_(grp_idx)] = np.pad(
-                    _shrinkage_factor, (0, self.n_levels_ - len(_shrinkage_factor)), "constant", constant_values=(0)
+                    _shrinkage_factor,
+                    (0, self.n_fitted_levels_ - len(_shrinkage_factor)),
+                    "constant",
+                    constant_values=(0),
                 )
 
         return (preds * np.atleast_3d(shrinkage)).sum(axis=1).squeeze()
@@ -423,6 +426,11 @@ def n_levels_(self):
     def _more_tags(self):
         return {"allow_nan": True}
 
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        return tags
+
 
 class HierarchicalRegressor(RegressorMixin, HierarchicalPredictor):
     """A hierarchical regressor that predicts values using hierarchical grouping.

diff --git a/sklego/meta/ordinal_classification.py b/sklego/meta/ordinal_classification.py
@@ -3,7 +3,8 @@
 from sklearn import clone
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, MultiOutputMixin, is_classifier
 from sklearn.calibration import CalibratedClassifierCV
-from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class OrdinalClassifier(MultiOutputMixin, ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
@@ -129,10 +130,10 @@ def fit(self, X, y):
         if not hasattr(self.estimator, "predict_proba"):
             raise ValueError("The estimator must implement `.predict_proba()` method.")
 
-        X, y = check_X_y(X, y, estimator=self, ensure_min_samples=2)
+        X, y = validate_data(self, X=X, y=y, ensure_min_samples=2, ensure_2d=True, reset=True)
+        _check_n_features(self, X, reset=True)
 
         self.classes_ = np.sort(np.unique(y))
-        self.n_features_in_ = X.shape[1]
 
         if self.n_classes_ < 3:
             raise ValueError("`OrdinalClassifier` can't train when less than 3 classes are present.")
@@ -172,10 +173,8 @@ def predict_proba(self, X):
             If `X` has a different number of features than the one seen during `fit`.
         """
         check_is_fitted(self, ["estimators_", "classes_"])
-        X = check_array(X, ensure_2d=True, estimator=self)
-
-        if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_} features.")
+        X = validate_data(self, X=X, ensure_2d=True, reset=False)
+        _check_n_features(self, X, reset=False)
 
         raw_proba = np.array([estimator.predict_proba(X)[:, 1] for estimator in self.estimators_.values()]).T
         p_y_le = np.column_stack((np.zeros(X.shape[0]), raw_proba, np.ones(X.shape[0])))
@@ -197,6 +196,7 @@ def predict(self, X):
             The predicted class labels.
         """
         check_is_fitted(self, ["estimators_", "classes_"])
+        X = validate_data(self, X=X, ensure_2d=True, reset=False)
         return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
 
     def _fit_binary_estimator(self, X, y, y_label):

diff --git a/sklego/meta/outlier_classifier.py b/sklego/meta/outlier_classifier.py
@@ -2,7 +2,8 @@
 from sklearn import clone
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
 from sklearn.calibration import _SigmoidCalibration
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import validate_data
 
 from sklego.base import OutlierModel
 
@@ -87,7 +88,10 @@ def fit(self, X, y=None):
                 f"Passed model {self.model} does not have a `decision_function` "
                 f"method. This is required for `predict_proba` estimation."
             )
-        X, y = check_X_y(X, y)
+        if y is not None:
+            X, y = validate_data(self, X=X, y=y, reset=True)
+        else:
+            X = validate_data(self, X=X, reset=True)
         self.estimator_ = clone(self.model).fit(X, y)
         self.n_features_in_ = self.estimator_.n_features_in_
         self.classes_ = np.array([0, 1])
@@ -112,6 +116,7 @@ def predict(self, X):
             The predicted values. 0 for inliers, 1 for outliers.
         """
         check_is_fitted(self, ["estimator_", "classes_"])
+        X = validate_data(self, X=X, reset=False)
         preds = self.estimator_.predict(X)
         result = (preds == -1).astype(int)
         return result
@@ -130,6 +135,7 @@ def predict_proba(self, X):
             The predicted probabilities.
         """
         check_is_fitted(self, ["estimator_", "classes_"])
+        X = validate_data(self, X=X, reset=False)
         decision_function_scores = self.estimator_.decision_function(X)
         probabilities = self._predict_proba_sigmoid.predict(decision_function_scores).reshape(-1, 1)
         complement = np.ones_like(probabilities) - probabilities

diff --git a/sklego/meta/regression_outlier_detector.py b/sklego/meta/regression_outlier_detector.py
@@ -2,7 +2,8 @@
 import numpy as np
 from sklearn import clone
 from sklearn.base import BaseEstimator, OutlierMixin
-from sklearn.utils.validation import check_array, check_is_fitted
+from sklearn.utils.validation import check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class RegressionOutlierDetector(OutlierMixin, BaseEstimator):
@@ -135,9 +136,8 @@ def fit(self, X, y=None):
         """
         X = nw.from_native(X, eager_only=True, strict=False)
         self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, nw.DataFrame) else self.column
-        X = check_array(nw.to_native(X, strict=False), estimator=self)
-
-        self.n_features_in_ = X.shape[1]
+        X = validate_data(self, nw.to_native(X, strict=False), reset=True)
+        _check_n_features(self, X, reset=True)
 
         if not self._is_regression_model():
             raise ValueError("Passed model must be regression!")
@@ -164,7 +164,9 @@ def predict(self, X, y=None):
             The predicted values. 1 for inliers, -1 for outliers.
         """
         check_is_fitted(self, ["estimator_", "sd_", "idx_"])
-        X = check_array(X, estimator=self)
+        X = validate_data(self, X=X, reset=False)
+        _check_n_features(self, X, reset=False)
+
         X, y = self.to_x_y(X)
         preds = self.estimator_.predict(X)
         return self._handle_thresholds(y, preds)
@@ -190,7 +192,9 @@ def score_samples(self, X, y=None):
             If `method` is not one of "sd", "relative", or "absolute".
         """
         check_is_fitted(self, ["estimator_", "sd_", "idx_"])
-        X = check_array(X, estimator=self)
+        X = validate_data(self, X=X, reset=False)
+        _check_n_features(self, X, reset=False)
+
         X, y_true = self.to_x_y(X)
         y_pred = self.estimator_.predict(X)
         difference = y_true - y_pred

diff --git a/sklego/meta/subjective_classifier.py b/sklego/meta/subjective_classifier.py
@@ -3,7 +3,8 @@
 from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
 from sklearn.metrics import confusion_matrix
 from sklearn.preprocessing import normalize
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
+from sklearn_compat.utils.validation import _check_n_features, validate_data
 
 
 class SubjectiveClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
@@ -109,7 +110,9 @@ def fit(self, X, y):
         if self.evidence not in self._ALLOWED_EVIDENCE:
             raise ValueError(f"Invalid evidence: the provided evidence should be one of {self._ALLOWED_EVIDENCE}")
 
-        X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
+        X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
+        _check_n_features(self, X, reset=True)
+
         if set(y) - set(self.prior.keys()):
             raise ValueError(
                 f"Training data is inconsistent with prior: no prior defined for classes "
@@ -120,7 +123,6 @@ def fit(self, X, y):
         self.posterior_matrix_ = np.array(
             [[self._posterior(y, y_hat, cfm) for y_hat in range(cfm.shape[0])] for y in range(cfm.shape[0])]
         )
-        self.n_features_in_ = X.shape[1]
         return self
 
     @staticmethod
@@ -147,7 +149,9 @@ def predict_proba(self, X):
             The predicted probabilities.
         """
         check_is_fitted(self, ["posterior_matrix_"])
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         y_hats = self.estimator_.predict_proba(X)  # these are ignorant of the prior
 
         if self.evidence == "predict_proba":
@@ -171,7 +175,9 @@ def predict(self, X):
             The predicted class.
         """
         check_is_fitted(self, ["posterior_matrix_"])
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
+        _check_n_features(self, X, reset=False)
+
         return self.classes_[self.predict_proba(X).argmax(axis=1)]
 
     @property