Skip to content

Commit

Permalink
meta module
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Dec 15, 2024
1 parent bac7e83 commit e1b2520
Show file tree
Hide file tree
Showing 19 changed files with 129 additions and 62 deletions.
14 changes: 9 additions & 5 deletions sklego/meta/confusion_balancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data

from sklego.base import ProbabilisticClassifier

Expand Down Expand Up @@ -63,7 +64,9 @@ def fit(self, X, y):
If the underlying estimator does not have a `predict_proba` method.
"""

X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

if not isinstance(self.estimator, ProbabilisticClassifier):
raise ValueError(
"The ConfusionBalancer meta model only works on classification models with .predict_proba."
Expand All @@ -72,7 +75,6 @@ def fit(self, X, y):
self.classes_ = unique_labels(y)
cfm = confusion_matrix(y, self.estimator_.predict(X)).T + self.cfm_smooth
self.cfm_ = cfm / cfm.sum(axis=1).reshape(-1, 1)
self.n_features_in_ = X.shape[1]
return self

def predict_proba(self, X):
Expand All @@ -90,7 +92,8 @@ def predict_proba(self, X):
The predicted values.
"""
check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
X = check_array(X, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)
preds = self.estimator_.predict_proba(X)
return (1 - self.alpha) * preds + self.alpha * preds @ self.cfm_

Expand All @@ -108,5 +111,6 @@ def predict(self, X):
The predicted values.
"""
check_is_fitted(self, ["cfm_", "classes_", "estimator_"])
X = check_array(X, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)
return self.classes_[self.predict_proba(X).argmax(axis=1)]
12 changes: 8 additions & 4 deletions sklego/meta/decay_estimator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from sklearn import clone
from sklearn.base import BaseEstimator, MetaEstimatorMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_X_y
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data

from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay


class DecayEstimator(BaseEstimator, MetaEstimatorMixin):
class DecayEstimator(MetaEstimatorMixin, BaseEstimator):
"""Morphs an estimator such that the training weights can be adapted to ensure that points that are far away have
less weight.
Expand Down Expand Up @@ -123,7 +124,8 @@ def fit(self, X, y):
"""

if self.check_input:
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, ensure_min_features=0)
X, y = validate_data(self, X, y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

if self.decay_func in self._ALLOWED_DECAYS.keys():
self.decay_func_ = self._ALLOWED_DECAYS[self.decay_func]
Expand All @@ -144,7 +146,6 @@ def fit(self, X, y):
if self._is_classifier():
self.classes_ = self.estimator_.classes_

self.n_features_in_ = X.shape[1]
return self

def predict(self, X):
Expand All @@ -169,3 +170,6 @@ def predict(self, X):
# Thin pass-through: X and y are forwarded to `self.estimator_.score` and its
# return value is handed back unchanged. No validation or fitted-check is done
# in this method itself.
def score(self, X, y):
"""Alias for `.score()` method of the underlying estimator."""
return self.estimator_.score(X, y)

# Tag hook for this meta-estimator: returns the result of
# `self.model.__sklearn_tags__()` as-is. Nothing is added or overridden here,
# and `super().__sklearn_tags__()` is not consulted.
# NOTE(review): this requires the wrapped `self.model` to implement
# `__sklearn_tags__` itself -- confirm for every supported model / scikit-learn
# version.
def __sklearn_tags__(self):
return self.model.__sklearn_tags__()
11 changes: 9 additions & 2 deletions sklego/meta/estimator_transformer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from sklearn import clone
from sklearn.base import BaseEstimator, MetaEstimatorMixin, TransformerMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted, check_X_y
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class EstimatorTransformer(TransformerMixin, MetaEstimatorMixin, BaseEstimator):
Expand Down Expand Up @@ -52,7 +53,9 @@ def fit(self, X, y, **kwargs):
"""

if self.check_input:
X, y = check_X_y(X, y, estimator=self, dtype=FLOAT_DTYPES, multi_output=True)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, multi_output=True, reset=True)

_check_n_features(self, X, reset=True)

self.multi_output_ = len(y.shape) > 1
self.estimator_ = clone(self.estimator)
Expand All @@ -76,5 +79,9 @@ def transform(self, X):
"""

check_is_fitted(self, "estimator_")
if self.check_input:
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

output = getattr(self.estimator_, self.predict_func)(X)
return output if self.multi_output_ else output.reshape(-1, 1)
5 changes: 5 additions & 0 deletions sklego/meta/grouped_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,11 @@ def _estimator_type(self):
# Dict-based tag hook: returns a one-entry dict that sets "allow_nan" to True.
# NOTE(review): presumably kept for scikit-learn versions that still read
# `_more_tags` rather than `__sklearn_tags__` -- confirm against the supported
# version range.
def _more_tags(self):
return {"allow_nan": True}

# Object-based tag hook: takes the tags produced by the parent classes via
# `super().__sklearn_tags__()`, sets `input_tags.allow_nan` to True on that
# object, and returns it. This carries the same "allow_nan" value as the
# dict returned by `_more_tags`.
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.allow_nan = True
return tags


class GroupedRegressor(RegressorMixin, GroupedPredictor):
"""`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data.
Expand Down
5 changes: 5 additions & 0 deletions sklego/meta/grouped_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ def transform(self, X):
# Dict-based tag hook: returns a one-entry dict that sets "allow_nan" to True.
# NOTE(review): presumably kept for scikit-learn versions that still read
# `_more_tags` rather than `__sklearn_tags__` -- confirm against the supported
# version range.
def _more_tags(self):
return {"allow_nan": True}

# Object-based tag hook: takes the tags produced by the parent classes via
# `super().__sklearn_tags__()`, sets `input_tags.allow_nan` to True on that
# object, and returns it. This carries the same "allow_nan" value as the
# dict returned by `_more_tags`.
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.allow_nan = True
return tags

# Returns `self.feature_names_out_` directly, with no copy and no fitted-check
# in this method. The annotation promises a list of strings; the assignment of
# `feature_names_out_` is not visible in this hunk (the docstring says it
# happens during fit).
def get_feature_names_out(self) -> List[str]:
"Alias for the `feature_names_out_` attribute defined during fit."
return self.feature_names_out_
16 changes: 12 additions & 4 deletions sklego/meta/hierarchical_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ class HierarchicalPredictor(ShrinkageMixin, MetaEstimatorMixin, BaseEstimator):
Number of features in the training data.
n_features_ : int
Number of features used by the estimators.
n_levels_ : int
n_fitted_levels_ : int
Number of hierarchical levels in the grouping.
"""

Expand Down Expand Up @@ -341,8 +341,8 @@ def _predict_estimators(self, X, method_name):
else: # binary case with `method_name = "decision_function"`
n_out = 1

preds = np.zeros((X.shape[0], self.n_levels_, n_out), dtype=float)
shrinkage = np.zeros((X.shape[0], self.n_levels_), dtype=float)
preds = np.zeros((X.shape[0], self.n_fitted_levels_, n_out), dtype=float)
shrinkage = np.zeros((X.shape[0], self.n_fitted_levels_), dtype=float)

for level_idx, grp_names in enumerate(self.fitted_levels_):
for grp_values, grp_frame in frame.group_by(grp_names):
Expand All @@ -363,7 +363,10 @@ def _predict_estimators(self, X, method_name):

preds[np.ix_(grp_idx, [level_idx], last_dim_ix)] = np.atleast_3d(raw_pred[:, None])
shrinkage[np.ix_(grp_idx)] = np.pad(
_shrinkage_factor, (0, self.n_levels_ - len(_shrinkage_factor)), "constant", constant_values=(0)
_shrinkage_factor,
(0, self.n_fitted_levels_ - len(_shrinkage_factor)),
"constant",
constant_values=(0),
)

return (preds * np.atleast_3d(shrinkage)).sum(axis=1).squeeze()
Expand Down Expand Up @@ -423,6 +426,11 @@ def n_levels_(self):
# Dict-based tag hook: returns a one-entry dict that sets "allow_nan" to True.
# NOTE(review): presumably kept for scikit-learn versions that still read
# `_more_tags` rather than `__sklearn_tags__` -- confirm against the supported
# version range.
def _more_tags(self):
return {"allow_nan": True}

# Object-based tag hook: takes the tags produced by the parent classes via
# `super().__sklearn_tags__()`, sets `input_tags.allow_nan` to True on that
# object, and returns it. This carries the same "allow_nan" value as the
# dict returned by `_more_tags`.
def __sklearn_tags__(self):
tags = super().__sklearn_tags__()
tags.input_tags.allow_nan = True
return tags


class HierarchicalRegressor(RegressorMixin, HierarchicalPredictor):
"""A hierarchical regressor that predicts values using hierarchical grouping.
Expand Down
14 changes: 7 additions & 7 deletions sklego/meta/ordinal_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from sklearn import clone
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, MultiOutputMixin, is_classifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class OrdinalClassifier(MultiOutputMixin, ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
Expand Down Expand Up @@ -129,10 +130,10 @@ def fit(self, X, y):
if not hasattr(self.estimator, "predict_proba"):
raise ValueError("The estimator must implement `.predict_proba()` method.")

X, y = check_X_y(X, y, estimator=self, ensure_min_samples=2)
X, y = validate_data(self, X=X, y=y, ensure_min_samples=2, ensure_2d=True, reset=True)
_check_n_features(self, X, reset=True)

self.classes_ = np.sort(np.unique(y))
self.n_features_in_ = X.shape[1]

if self.n_classes_ < 3:
raise ValueError("`OrdinalClassifier` can't train when less than 3 classes are present.")
Expand Down Expand Up @@ -172,10 +173,8 @@ def predict_proba(self, X):
If `X` has a different number of features than the one seen during `fit`.
"""
check_is_fitted(self, ["estimators_", "classes_"])
X = check_array(X, ensure_2d=True, estimator=self)

if X.shape[1] != self.n_features_in_:
raise ValueError(f"X has {X.shape[1]} features, expected {self.n_features_in_} features.")
X = validate_data(self, X=X, ensure_2d=True, reset=False)
_check_n_features(self, X, reset=False)

raw_proba = np.array([estimator.predict_proba(X)[:, 1] for estimator in self.estimators_.values()]).T
p_y_le = np.column_stack((np.zeros(X.shape[0]), raw_proba, np.ones(X.shape[0])))
Expand All @@ -197,6 +196,7 @@ def predict(self, X):
The predicted class labels.
"""
check_is_fitted(self, ["estimators_", "classes_"])
X = validate_data(self, X=X, ensure_2d=True, reset=False)
return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

def _fit_binary_estimator(self, X, y, y_label):
Expand Down
10 changes: 8 additions & 2 deletions sklego/meta/outlier_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from sklearn import clone
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
from sklearn.calibration import _SigmoidCalibration
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import validate_data

from sklego.base import OutlierModel

Expand Down Expand Up @@ -87,7 +88,10 @@ def fit(self, X, y=None):
f"Passed model {self.model} does not have a `decision_function` "
f"method. This is required for `predict_proba` estimation."
)
X, y = check_X_y(X, y)
if y is not None:
X, y = validate_data(self, X=X, y=y, reset=True)
else:
X = validate_data(self, X=X, reset=True)
self.estimator_ = clone(self.model).fit(X, y)
self.n_features_in_ = self.estimator_.n_features_in_
self.classes_ = np.array([0, 1])
Expand All @@ -112,6 +116,7 @@ def predict(self, X):
The predicted values. 0 for inliers, 1 for outliers.
"""
check_is_fitted(self, ["estimator_", "classes_"])
X = validate_data(self, X=X, reset=False)
preds = self.estimator_.predict(X)
result = (preds == -1).astype(int)
return result
Expand All @@ -130,6 +135,7 @@ def predict_proba(self, X):
The predicted probabilities.
"""
check_is_fitted(self, ["estimator_", "classes_"])
X = validate_data(self, X=X, reset=False)
decision_function_scores = self.estimator_.decision_function(X)
probabilities = self._predict_proba_sigmoid.predict(decision_function_scores).reshape(-1, 1)
complement = np.ones_like(probabilities) - probabilities
Expand Down
16 changes: 10 additions & 6 deletions sklego/meta/regression_outlier_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import numpy as np
from sklearn import clone
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.utils.validation import check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class RegressionOutlierDetector(OutlierMixin, BaseEstimator):
Expand Down Expand Up @@ -135,9 +136,8 @@ def fit(self, X, y=None):
"""
X = nw.from_native(X, eager_only=True, strict=False)
self.idx_ = np.argmax([i == self.column for i in X.columns]) if isinstance(X, nw.DataFrame) else self.column
X = check_array(nw.to_native(X, strict=False), estimator=self)

self.n_features_in_ = X.shape[1]
X = validate_data(self, nw.to_native(X, strict=False), reset=True)
_check_n_features(self, X, reset=True)

if not self._is_regression_model():
raise ValueError("Passed model must be regression!")
Expand All @@ -164,7 +164,9 @@ def predict(self, X, y=None):
The predicted values. 1 for inliers, -1 for outliers.
"""
check_is_fitted(self, ["estimator_", "sd_", "idx_"])
X = check_array(X, estimator=self)
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

X, y = self.to_x_y(X)
preds = self.estimator_.predict(X)
return self._handle_thresholds(y, preds)
Expand All @@ -190,7 +192,9 @@ def score_samples(self, X, y=None):
If `method` is not one of "sd", "relative", or "absolute".
"""
check_is_fitted(self, ["estimator_", "sd_", "idx_"])
X = check_array(X, estimator=self)
X = validate_data(self, X=X, reset=False)
_check_n_features(self, X, reset=False)

X, y_true = self.to_x_y(X)
y_pred = self.estimator_.predict(X)
difference = y_true - y_pred
Expand Down
16 changes: 11 additions & 5 deletions sklego/meta/subjective_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from sklearn.base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted, check_X_y
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
from sklearn_compat.utils.validation import _check_n_features, validate_data


class SubjectiveClassifier(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
Expand Down Expand Up @@ -109,7 +110,9 @@ def fit(self, X, y):
if self.evidence not in self._ALLOWED_EVIDENCE:
raise ValueError(f"Invalid evidence: the provided evidence should be one of {self._ALLOWED_EVIDENCE}")

X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
X, y = validate_data(self, X=X, y=y, dtype=FLOAT_DTYPES, reset=True)
_check_n_features(self, X, reset=True)

if set(y) - set(self.prior.keys()):
raise ValueError(
f"Training data is inconsistent with prior: no prior defined for classes "
Expand All @@ -120,7 +123,6 @@ def fit(self, X, y):
self.posterior_matrix_ = np.array(
[[self._posterior(y, y_hat, cfm) for y_hat in range(cfm.shape[0])] for y in range(cfm.shape[0])]
)
self.n_features_in_ = X.shape[1]
return self

@staticmethod
Expand All @@ -147,7 +149,9 @@ def predict_proba(self, X):
The predicted probabilities.
"""
check_is_fitted(self, ["posterior_matrix_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

y_hats = self.estimator_.predict_proba(X) # these are ignorant of the prior

if self.evidence == "predict_proba":
Expand All @@ -171,7 +175,9 @@ def predict(self, X):
The predicted class.
"""
check_is_fitted(self, ["posterior_matrix_"])
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X=X, dtype=FLOAT_DTYPES, reset=False)
_check_n_features(self, X, reset=False)

return self.classes_[self.predict_proba(X).argmax(axis=1)]

@property
Expand Down
Loading

0 comments on commit e1b2520

Please sign in to comment.