From 05aff137673f85cb2c376c819d97b69f0a487b28 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 21 Jan 2025 18:40:47 +0100 Subject: [PATCH] Rename files --- src/hidimstat/permutation_importance.py | 355 +++++++++--------- .../permutation_importance_function.py | 203 ---------- .../permutation_importance_scikitlearn.py | 309 --------------- test/test_permutation_importance.py | 90 +++-- test/test_permutation_importance_function.py | 120 ------ ...test_permutation_importance_scikitlearn.py | 75 ---- 6 files changed, 243 insertions(+), 909 deletions(-) delete mode 100644 src/hidimstat/permutation_importance_function.py delete mode 100644 src/hidimstat/permutation_importance_scikitlearn.py delete mode 100644 test/test_permutation_importance_function.py delete mode 100644 test/test_permutation_importance_scikitlearn.py diff --git a/src/hidimstat/permutation_importance.py b/src/hidimstat/permutation_importance.py index ad6c9c1c..b6a76cef 100644 --- a/src/hidimstat/permutation_importance.py +++ b/src/hidimstat/permutation_importance.py @@ -1,192 +1,203 @@ import numpy as np -import pandas as pd from joblib import Parallel, delayed -from sklearn.base import BaseEstimator, check_is_fitted from sklearn.metrics import root_mean_squared_error +from sklearn.exceptions import NotFittedError +from sklearn.utils import check_random_state +from sklearn.base import clone from hidimstat.utils import _check_vim_predict_method -class PermutationImportance(BaseEstimator): +def permutation_importance( + X, + y, + estimator, + n_permutations: int = 50, + loss: callable = root_mean_squared_error, + method: str = "predict", + random_state: int = None, + n_jobs: int = None, + groups=None, +): """ + # Permutation importance + + Calculate permutation importance scores for features or feature groups in a machine learning model. + Permutation importance is a model inspection technique that measures the increase in the model's + prediction error after permuting a feature's values. A feature is considered "important" if shuffling + its values increases the model error, because the model relied on the feature for the prediction. + The implementation follows the methodology described in chapter 10 :cite:breimanRandomForests2001. + One implementation: https://github.com/SkadiEye/deepTL/blob/master/R/4-2-permfit.R + Parameters ---------- - estimator: scikit-learn compatible estimator - The predictive model. - n_permutations: int, default=50 - Number of permutations to perform. - loss: callable, default=root_mean_squared_error - Loss function to evaluate the model performance. - method: str, default='predict' - Method to use for predicting values that will be used to compute - the loss and the importance scores. The method must be implemented by the - estimator. Supported methods are 'predict', 'predict_proba', - 'decision_function' and 'transform'. - random_state: int, default=None - Random seed for the permutation. - n_jobs: int, default=1 - Number of jobs to run in parallel. + X : np.ndarray of shape (n_samples, n_features) + Training data. Can be numpy array or pandas DataFrame. + y : np.ndarray of shape (n_samples,) + Target values for the model. + estimator : object + A fitted estimator object implementing scikit-learn estimator interface. + The estimator must have a fitting method and one of the following prediction methods: + 'predict', 'predict_proba', 'decision_function', or 'transform'. + n_permutations : int, default=50 + Number of times to permute each feature or feature group. 
+ Higher values give more stable results but take longer to compute. + loss : callable, default=root_mean_squared_error + Function to measure the prediction error. Must take two arguments (y_true, y_pred) + and return a scalar value. Higher return values must indicate worse predictions. + method : str, default='predict' + The estimator method used for prediction. Must be one of: + - 'predict': Use estimator.predict() + - 'predict_proba': Use estimator.predict_proba() + - 'decision_function': Use estimator.decision_function() + - 'transform': Use estimator.transform() + random_state : int, default=None + Controls the randomness of the feature permutations. + Pass an int for reproducible results across multiple function calls. + n_jobs : int, default=None + Number of jobs to run in parallel. None means 1 unless in a joblib.parallel_backend context. + -1 means using all processors. + groups : dict, default=None + Dictionary specifying feature groups. Keys are group names and values are lists of feature + indices or feature names (if X is a pandas DataFrame). If None, each feature is treated + as its own group. + + Returns + ------- + importance : np.ndarray of shape (n_features,) or (n_groups,) + The importance scores for each feature or feature group. + Higher values indicate more important features. + list_loss_j : np.ndarray + Array containing all computed loss values for each permutation of each feature/group. + loss_reference : float + The reference loss (baseline) computed on the original, non-permuted data. + + Notes + ----- + The implementation supports both individual feature importance and group feature importance. + For group importance, features within the same group are permuted together. References ---------- .. footbibliography:: """ - def __init__( - self, - estimator, - n_permutations: int = 50, - loss: callable = root_mean_squared_error, - method: str = "predict", - random_state: int = None, - n_jobs: int = 1, - ): - - check_is_fitted(estimator) - self.estimator = estimator - self.n_permutations = n_permutations - - self.random_state = random_state - self.loss = loss - _check_vim_predict_method(method) - self.method = method - self.n_jobs = n_jobs - self.rng = np.random.RandomState(random_state) - self.n_groups = None - - def fit(self, X, y=None, groups=None): - """ - Parameters - ---------- - X: np.ndarray of shape (n_samples, n_features) - The input samples. Not used here. - y: np.ndarray of shape (n_samples,) - The target values. Not used here. - groups: dict, default=None - Dictionary of groups for the covariates. The keys are the group names - and the values are lists of covariate indices. - """ - self.groups = groups - return self - - def predict(self, X, y=None): - """ - Compute the prediction of the model with permuted data for each group. - - Parameters - ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - The target values. 
- - Returns - ------- - premuted_y_pred: np.ndarray of shape (n_groups, n_permutations, n_samples) - The predictions of the model with permuted data for each group - - """ - check_is_fitted(self.estimator) - if self.groups is None: - self.n_groups = X.shape[1] - self.groups = {j: [j] for j in range(self.n_groups)} + # check parameters + _check_vim_predict_method(method) + + # define a random generator + check_random_state(random_state) + rng = np.random.RandomState(random_state) + + # management of the group + if groups is None: + n_groups = X.shape[1] + groups_ = {j: [j] for j in range(n_groups)} + else: + n_groups = len(groups) + if type(list(groups.values())[0][0]) is str: + groups_ = {} + for key, indexe_names in zip(groups.keys(), groups.values()): + groups_[key] = [] + for index_name in indexe_names: + index = np.where(index_name == X.columns)[0] + assert len(index) == 1 + groups_[key].append(index) else: - self.n_groups = len(self.groups) - - def _joblib_predict_one_group(X, j): - """ - Compute the importance score for a single group of covariates. - """ - if isinstance(X, pd.DataFrame): - X_j = X[self.groups[j]].copy().values - X_minus_j = X.drop(columns=self.groups[j]).values - group_ids = [ - i for i, col in enumerate(X.columns) if col in self.groups[j] - ] - non_group_ids = [ - i for i, col in enumerate(X.columns) if col not in self.groups[j] - ] - else: - X_j = X[:, self.groups[j]].copy() - X_minus_j = np.delete(X, self.groups[j], axis=1) - group_ids = self.groups[j] - non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) - - # Create an array X_perm_j of shape (n_permutations, n_samples, n_features) - # where the j-th group of covariates is permuted - X_perm_j = np.empty((self.n_permutations, X.shape[0], X.shape[1])) - X_perm_j[:, :, non_group_ids] = X_minus_j - # Create the permuted data for the j-th group of covariates - group_j_permuted = np.array( - [self.rng.permutation(X_j) for _ in range(self.n_permutations)] - ) - X_perm_j[:, :, group_ids] = group_j_permuted - # Reshape X_perm_j to allow for batch prediction - X_perm_batch = X_perm_j.reshape(-1, X.shape[1]) - if isinstance(X, pd.DataFrame): - X_perm_batch = pd.DataFrame( - X_perm_batch.reshape(-1, X.shape[1]), columns=X.columns - ) - y_pred_perm = getattr(self.estimator, self.method)(X_perm_batch) - - # In case of classification, the output is a 2D array. 
Reshape accordingly - if y_pred_perm.ndim == 1: - y_pred_perm = y_pred_perm.reshape(self.n_permutations, X.shape[0]) - else: - y_pred_perm = y_pred_perm.reshape( - self.n_permutations, X.shape[0], y_pred_perm.shape[1] - ) - return y_pred_perm - - # Parallelize the computation of the importance scores for each group - out_list = Parallel(n_jobs=self.n_jobs)( - delayed(_joblib_predict_one_group)(X, j) for j in self.groups.keys() + groups_ = groups + + X_ = np.asarray(X) # avoid the management of panda dataframe + + # compute the reference residual + try: + y_pred = getattr(estimator, method)(X) + estimator_ = estimator + except NotFittedError: + estimator_ = clone(estimator) + # case for not fitted esimator + estimator_.fit(X_, y) + y_pred = getattr(estimator_, method)(X) + loss_reference = loss(y, y_pred) + + # Parallelize the computation of the residual for each permutation + # of each group + list_loss_j = Parallel(n_jobs=n_jobs)( + delayed(_predict_one_group)( + estimator_, + groups_[j], + X_, + y, + loss, + n_permutations, + rng, + method, ) + for j in groups_.keys() + ) + list_loss_j = np.array(list_loss_j) - premuted_y_pred = np.stack(out_list, axis=0) - return premuted_y_pred - - def score(self, X, y): - """ - Compute the importance scores for each group of covariates. - - Parameters - ---------- - X: array-like of shape (n_samples, n_features) - The input samples. - y: array-like of shape (n_samples,) - The target values. - - Returns - ------- - out_dict: dict - A dictionary containing the following keys: - - 'loss_reference': the loss of the model with the original data. - - 'loss_perm': a dictionary containing the loss of the model with - the permuted data for each group. - - 'importance': the importance scores for each group. - """ - check_is_fitted(self.estimator) - - output_dict = dict() - y_pred = getattr(self.estimator, self.method)(X) - loss_reference = self.loss(y, y_pred) - output_dict["loss_reference"] = loss_reference - - y_pred_perm = self.predict(X, y) - - output_dict["loss_perm"] = dict() - for j, y_pred_j in enumerate(y_pred_perm): - list_loss_perm = [] - for y_pred_perm in y_pred_j: - list_loss_perm.append(self.loss(y, y_pred_perm)) - output_dict["loss_perm"][j] = np.array(list_loss_perm) - - output_dict["importance"] = np.array( - [ - np.mean(output_dict["loss_perm"][j]) - output_dict["loss_reference"] - for j in range(self.n_groups) - ] - ) + # compute the importance + # equation 5 of mi2021permutation + importance = np.mean(list_loss_j - loss_reference, axis=1) - return output_dict + return importance, list_loss_j, loss_reference + + +def _predict_one_group(estimator, group_ids, X, y, loss, n_permutations, rng, method): + """ + Compute prediction loss scores after permuting a single group of features. + + Parameters + ---------- + estimator : object + Fitted estimator implementing scikit-learn API + group_ids : list + Indices of features in the group to permute + X : np.ndarray + Input data matrix + y : np.ndarray + Target values + loss : callable + Loss function to evaluate predictions + n_permutations : int + Number of permutations to perform + rng : RandomState + Random number generator instance + method : str + Prediction method to use ('predict', 'predict_proba', etc.) 
+ + Returns + ------- + list + Loss values for each permutation + """ + # get ids + non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) + + # get data + X_j = X[:, group_ids].copy() + X_minus_j = np.delete(X, group_ids, axis=1) + + # Create an array X_perm_j of shape (n_permutations, n_samples, n_features) + # where the j-th group of covariates is permuted + X_perm_j = np.empty((n_permutations, X.shape[0], X.shape[1])) + X_perm_j[:, :, non_group_ids] = X_minus_j + + # Create the permuted data for the j-th group of covariates + group_j_permuted = np.array([rng.permutation(X_j) for _ in range(n_permutations)]) + X_perm_j[:, :, group_ids] = group_j_permuted + + # Reshape X_perm_j to allow for remove the indexation by groups + X_perm_batch = X_perm_j.reshape(-1, X.shape[1]) + y_pred_perm = getattr(estimator, method)(X_perm_batch) + + if y_pred_perm.ndim == 1: + # one value per y: regression + y_pred_perm = y_pred_perm.reshape(n_permutations, X.shape[0]) + else: + # probability per y: classification + y_pred_perm = y_pred_perm.reshape( + n_permutations, X.shape[0], y_pred_perm.shape[1] + ) + loss_i = [loss(y, y_pred_perm[i]) for i in range(n_permutations)] + return loss_i diff --git a/src/hidimstat/permutation_importance_function.py b/src/hidimstat/permutation_importance_function.py deleted file mode 100644 index b6a76cef..00000000 --- a/src/hidimstat/permutation_importance_function.py +++ /dev/null @@ -1,203 +0,0 @@ -import numpy as np -from joblib import Parallel, delayed -from sklearn.metrics import root_mean_squared_error -from sklearn.exceptions import NotFittedError -from sklearn.utils import check_random_state -from sklearn.base import clone - -from hidimstat.utils import _check_vim_predict_method - - -def permutation_importance( - X, - y, - estimator, - n_permutations: int = 50, - loss: callable = root_mean_squared_error, - method: str = "predict", - random_state: int = None, - n_jobs: int = None, - groups=None, -): - """ - # Permutation importance - - Calculate permutation importance scores for features or feature groups in a machine learning model. - Permutation importance is a model inspection technique that measures the increase in the model's - prediction error after permuting a feature's values. A feature is considered "important" if shuffling - its values increases the model error, because the model relied on the feature for the prediction. - The implementation follows the methodology described in chapter 10 :cite:breimanRandomForests2001. - One implementation: https://github.com/SkadiEye/deepTL/blob/master/R/4-2-permfit.R - - Parameters - ---------- - X : np.ndarray of shape (n_samples, n_features) - Training data. Can be numpy array or pandas DataFrame. - y : np.ndarray of shape (n_samples,) - Target values for the model. - estimator : object - A fitted estimator object implementing scikit-learn estimator interface. - The estimator must have a fitting method and one of the following prediction methods: - 'predict', 'predict_proba', 'decision_function', or 'transform'. - n_permutations : int, default=50 - Number of times to permute each feature or feature group. - Higher values give more stable results but take longer to compute. - loss : callable, default=root_mean_squared_error - Function to measure the prediction error. Must take two arguments (y_true, y_pred) - and return a scalar value. Higher return values must indicate worse predictions. - method : str, default='predict' - The estimator method used for prediction. 
Must be one of: - - 'predict': Use estimator.predict() - - 'predict_proba': Use estimator.predict_proba() - - 'decision_function': Use estimator.decision_function() - - 'transform': Use estimator.transform() - random_state : int, default=None - Controls the randomness of the feature permutations. - Pass an int for reproducible results across multiple function calls. - n_jobs : int, default=None - Number of jobs to run in parallel. None means 1 unless in a joblib.parallel_backend context. - -1 means using all processors. - groups : dict, default=None - Dictionary specifying feature groups. Keys are group names and values are lists of feature - indices or feature names (if X is a pandas DataFrame). If None, each feature is treated - as its own group. - - Returns - ------- - importance : np.ndarray of shape (n_features,) or (n_groups,) - The importance scores for each feature or feature group. - Higher values indicate more important features. - list_loss_j : np.ndarray - Array containing all computed loss values for each permutation of each feature/group. - loss_reference : float - The reference loss (baseline) computed on the original, non-permuted data. - - Notes - ----- - The implementation supports both individual feature importance and group feature importance. - For group importance, features within the same group are permuted together. - - References - ---------- - .. footbibliography:: - """ - - # check parameters - _check_vim_predict_method(method) - - # define a random generator - check_random_state(random_state) - rng = np.random.RandomState(random_state) - - # management of the group - if groups is None: - n_groups = X.shape[1] - groups_ = {j: [j] for j in range(n_groups)} - else: - n_groups = len(groups) - if type(list(groups.values())[0][0]) is str: - groups_ = {} - for key, indexe_names in zip(groups.keys(), groups.values()): - groups_[key] = [] - for index_name in indexe_names: - index = np.where(index_name == X.columns)[0] - assert len(index) == 1 - groups_[key].append(index) - else: - groups_ = groups - - X_ = np.asarray(X) # avoid the management of panda dataframe - - # compute the reference residual - try: - y_pred = getattr(estimator, method)(X) - estimator_ = estimator - except NotFittedError: - estimator_ = clone(estimator) - # case for not fitted esimator - estimator_.fit(X_, y) - y_pred = getattr(estimator_, method)(X) - loss_reference = loss(y, y_pred) - - # Parallelize the computation of the residual for each permutation - # of each group - list_loss_j = Parallel(n_jobs=n_jobs)( - delayed(_predict_one_group)( - estimator_, - groups_[j], - X_, - y, - loss, - n_permutations, - rng, - method, - ) - for j in groups_.keys() - ) - list_loss_j = np.array(list_loss_j) - - # compute the importance - # equation 5 of mi2021permutation - importance = np.mean(list_loss_j - loss_reference, axis=1) - - return importance, list_loss_j, loss_reference - - -def _predict_one_group(estimator, group_ids, X, y, loss, n_permutations, rng, method): - """ - Compute prediction loss scores after permuting a single group of features. 
- - Parameters - ---------- - estimator : object - Fitted estimator implementing scikit-learn API - group_ids : list - Indices of features in the group to permute - X : np.ndarray - Input data matrix - y : np.ndarray - Target values - loss : callable - Loss function to evaluate predictions - n_permutations : int - Number of permutations to perform - rng : RandomState - Random number generator instance - method : str - Prediction method to use ('predict', 'predict_proba', etc.) - - Returns - ------- - list - Loss values for each permutation - """ - # get ids - non_group_ids = np.delete(np.arange(X.shape[1]), group_ids) - - # get data - X_j = X[:, group_ids].copy() - X_minus_j = np.delete(X, group_ids, axis=1) - - # Create an array X_perm_j of shape (n_permutations, n_samples, n_features) - # where the j-th group of covariates is permuted - X_perm_j = np.empty((n_permutations, X.shape[0], X.shape[1])) - X_perm_j[:, :, non_group_ids] = X_minus_j - - # Create the permuted data for the j-th group of covariates - group_j_permuted = np.array([rng.permutation(X_j) for _ in range(n_permutations)]) - X_perm_j[:, :, group_ids] = group_j_permuted - - # Reshape X_perm_j to allow for remove the indexation by groups - X_perm_batch = X_perm_j.reshape(-1, X.shape[1]) - y_pred_perm = getattr(estimator, method)(X_perm_batch) - - if y_pred_perm.ndim == 1: - # one value per y: regression - y_pred_perm = y_pred_perm.reshape(n_permutations, X.shape[0]) - else: - # probability per y: classification - y_pred_perm = y_pred_perm.reshape( - n_permutations, X.shape[0], y_pred_perm.shape[1] - ) - loss_i = [loss(y, y_pred_perm[i]) for i in range(n_permutations)] - return loss_i diff --git a/src/hidimstat/permutation_importance_scikitlearn.py b/src/hidimstat/permutation_importance_scikitlearn.py deleted file mode 100644 index 26c9f269..00000000 --- a/src/hidimstat/permutation_importance_scikitlearn.py +++ /dev/null @@ -1,309 +0,0 @@ -"""Permutation importance for estimators.""" - -import numbers - -import numpy as np - -from sklearn.ensemble._bagging import _generate_indices -from sklearn.metrics import check_scoring, get_scorer_names -from sklearn.model_selection._validation import _aggregate_score_dicts -from sklearn.utils import Bunch, _safe_indexing, check_array, check_random_state -from sklearn.utils._param_validation import ( - HasMethods, - Integral, - Interval, - RealNotInt, - StrOptions, - validate_params, -) -from sklearn.utils.parallel import Parallel, delayed - - -def _weights_scorer(scorer, estimator, X, y, sample_weight): - if sample_weight is not None: - return scorer(estimator, X, y, sample_weight=sample_weight) - return scorer(estimator, X, y) - - -def _calculate_permutation_scores( - estimator, - X, - y, - sample_weight, - col_idx, - random_state, - n_repeats, - scorer, - max_samples, -): - """Calculate score when `col_idx` is permuted.""" - random_state = check_random_state(random_state) - - # Work on a copy of X to ensure thread-safety in case of threading based - # parallelism. Furthermore, making a copy is also useful when the joblib - # backend is 'loky' (default) or the old 'multiprocessing': in those cases, - # if X is large it will be automatically be backed by a readonly memory map - # (memmap). X.copy() on the other hand is always guaranteed to return a - # writable data-structure whose columns can be shuffled inplace. 
- if max_samples < X.shape[0]: - row_indices = _generate_indices( - random_state=random_state, - bootstrap=False, - n_population=X.shape[0], - n_samples=max_samples, - ) - X_permuted = _safe_indexing(X, row_indices, axis=0) - y = _safe_indexing(y, row_indices, axis=0) - if sample_weight is not None: - sample_weight = _safe_indexing(sample_weight, row_indices, axis=0) - else: - X_permuted = X.copy() - - scores = [] - shuffling_idx = np.arange(X_permuted.shape[0]) - for _ in range(n_repeats): - random_state.shuffle(shuffling_idx) - if hasattr(X_permuted, "iloc"): - col = X_permuted.iloc[shuffling_idx, col_idx] - col.index = X_permuted.index - X_permuted[X_permuted.columns[col_idx]] = col - else: - X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] - scores.append(_weights_scorer(scorer, estimator, X_permuted, y, sample_weight)) - - if isinstance(scores[0], dict): - scores = _aggregate_score_dicts(scores) - else: - scores = np.array(scores) - - return scores - - -def _create_importances_bunch(baseline_score, permuted_score): - """Compute the importances as the decrease in score. - - Parameters - ---------- - baseline_score : ndarray of shape (n_features,) - The baseline score without permutation. - permuted_score : ndarray of shape (n_features, n_repeats) - The permuted scores for the `n` repetitions. - - Returns - ------- - importances : :class:`~sklearn.utils.Bunch` - Dictionary-like object, with the following attributes. - importances_mean : ndarray, shape (n_features, ) - Mean of feature importance over `n_repeats`. - importances_std : ndarray, shape (n_features, ) - Standard deviation over `n_repeats`. - importances : ndarray, shape (n_features, n_repeats) - Raw permutation importance scores. - """ - importances = baseline_score - permuted_score - return Bunch( - importances_mean=np.mean(importances, axis=1), - importances_std=np.std(importances, axis=1), - importances=importances, - ) - - -@validate_params( - { - "estimator": [HasMethods(["fit"])], - "X": ["array-like"], - "y": ["array-like", None], - "scoring": [ - StrOptions(set(get_scorer_names())), - callable, - list, - tuple, - dict, - None, - ], - "n_repeats": [Interval(Integral, 1, None, closed="left")], - "n_jobs": [Integral, None], - "random_state": ["random_state"], - "sample_weight": ["array-like", None], - "max_samples": [ - Interval(Integral, 1, None, closed="left"), - Interval(RealNotInt, 0, 1, closed="right"), - ], - }, - prefer_skip_nested_validation=True, -) -def permutation_importance( - estimator, - X, - y, - *, - scoring=None, - n_repeats=5, - n_jobs=None, - random_state=None, - sample_weight=None, - max_samples=1.0, -): - """Permutation importance for feature evaluation [BRE]_. - - The :term:`estimator` is required to be a fitted estimator. `X` can be the - data set used to train the estimator or a hold-out set. The permutation - importance of a feature is calculated as follows. First, a baseline metric, - defined by :term:`scoring`, is evaluated on a (potentially different) - dataset defined by the `X`. Next, a feature column from the validation set - is permuted and the metric is evaluated again. The permutation importance - is defined to be the difference between the baseline metric and metric from - permutating the feature column. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - estimator : object - An estimator that has already been :term:`fitted` and is compatible - with :term:`scorer`. 
- - X : ndarray or DataFrame, shape (n_samples, n_features) - Data on which permutation importance will be computed. - - y : array-like or None, shape (n_samples, ) or (n_samples, n_classes) - Targets for supervised or `None` for unsupervised. - - scoring : str, callable, list, tuple, or dict, default=None - Scorer to use. - If `scoring` represents a single score, one can use: - - - a single string (see :ref:`scoring_parameter`); - - a callable (see :ref:`scoring`) that returns a single value. - - If `scoring` represents multiple scores, one can use: - - - a list or tuple of unique strings; - - a callable returning a dictionary where the keys are the metric - names and the values are the metric scores; - - a dictionary with metric names as keys and callables a values. - - Passing multiple scores to `scoring` is more efficient than calling - `permutation_importance` for each of the scores as it reuses - predictions to avoid redundant computation. - - If None, the estimator's default scorer is used. - - n_repeats : int, default=5 - Number of times to permute a feature. - - n_jobs : int or None, default=None - Number of jobs to run in parallel. The computation is done by computing - permutation score for each columns and parallelized over the columns. - `None` means 1 unless in a :obj:`joblib.parallel_backend` context. - `-1` means using all processors. See :term:`Glossary ` - for more details. - - random_state : int, RandomState instance, default=None - Pseudo-random number generator to control the permutations of each - feature. - Pass an int to get reproducible results across function calls. - See :term:`Glossary `. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights used in scoring. - - .. versionadded:: 0.24 - - max_samples : int or float, default=1.0 - The number of samples to draw from X to compute feature importance - in each repeat (without replacement). - - - If int, then draw `max_samples` samples. - - If float, then draw `max_samples * X.shape[0]` samples. - - If `max_samples` is equal to `1.0` or `X.shape[0]`, all samples - will be used. - - While using this option may provide less accurate importance estimates, - it keeps the method tractable when evaluating feature importance on - large datasets. In combination with `n_repeats`, this allows to control - the computational speed vs statistical accuracy trade-off of this method. - - .. versionadded:: 1.0 - - Returns - ------- - result : :class:`~sklearn.utils.Bunch` or dict of such instances - Dictionary-like object, with the following attributes. - - importances_mean : ndarray of shape (n_features, ) - Mean of feature importance over `n_repeats`. - importances_std : ndarray of shape (n_features, ) - Standard deviation over `n_repeats`. - importances : ndarray of shape (n_features, n_repeats) - Raw permutation importance scores. - - If there are multiple scoring metrics in the scoring parameter - `result` is a dict with scorer names as keys (e.g. 'roc_auc') and - `Bunch` objects like above as values. - - References - ---------- - .. [BRE] :doi:`L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, - 2001. <10.1023/A:1010933404324>` - - Examples - -------- - >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.inspection import permutation_importance - >>> X = [[1, 9, 9],[1, 9, 9],[1, 9, 9], - ... [0, 9, 9],[0, 9, 9],[0, 9, 9]] - >>> y = [1, 1, 1, 0, 0, 0] - >>> clf = LogisticRegression().fit(X, y) - >>> result = permutation_importance(clf, X, y, n_repeats=10, - ... 
random_state=0)
-    >>> result.importances_mean
-    array([0.4666..., 0. , 0. ])
-    >>> result.importances_std
-    array([0.2211..., 0. , 0. ])
-    """
-    if not hasattr(X, "iloc"):
-        X = check_array(X, force_all_finite="allow-nan", dtype=None)
-
-    # Precompute random seed from the random state to be used
-    # to get a fresh independent RandomState instance for each
-    # parallel call to _calculate_permutation_scores, irrespective of
-    # the fact that variables are shared or not depending on the active
-    # joblib backend (sequential, thread-based or process-based).
-    random_state = check_random_state(random_state)
-    random_seed = random_state.randint(np.iinfo(np.int32).max + 1)
-
-    if not isinstance(max_samples, numbers.Integral):
-        max_samples = int(max_samples * X.shape[0])
-    elif max_samples > X.shape[0]:
-        raise ValueError("max_samples must be <= n_samples")
-
-    scorer = check_scoring(estimator, scoring=scoring)
-    baseline_score = _weights_scorer(scorer, estimator, X, y, sample_weight)
-
-    scores = Parallel(n_jobs=n_jobs)(
-        delayed(_calculate_permutation_scores)(
-            estimator,
-            X,
-            y,
-            sample_weight,
-            col_idx,
-            random_seed,
-            n_repeats,
-            scorer,
-            max_samples,
-        )
-        for col_idx in range(X.shape[1])
-    )
-
-    if isinstance(baseline_score, dict):
-        return {
-            name: _create_importances_bunch(
-                baseline_score[name],
-                # unpack the permuted scores
-                np.array([scores[col_idx][name] for col_idx in range(X.shape[1])]),
-            )
-            for name in baseline_score
-        }
-    else:
-        return _create_importances_bunch(baseline_score, np.array(scores))
diff --git a/test/test_permutation_importance.py b/test/test_permutation_importance.py
index 284d49d7..a0c76bae 100644
--- a/test/test_permutation_importance.py
+++ b/test/test_permutation_importance.py
@@ -4,10 +4,10 @@
 from sklearn.metrics import log_loss
 from sklearn.model_selection import train_test_split
 
-from hidimstat.permutation_importance import PermutationImportance
+from hidimstat.permutation_importance import permutation_importance
 
 
-def test_permutation_importance(linear_scenario):
+def test_permutation_importance_no_fitting(linear_scenario):
     X, y, beta = linear_scenario
     important_features = np.where(beta != 0)[0]
     non_important_features = np.where(beta == 0)[0]
@@ -15,9 +15,9 @@
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
 
     regression_model = LinearRegression()
-    regression_model.fit(X_train, y_train)
-
-    pi = PermutationImportance(
+    importance, list_loss_j, loss_reference = permutation_importance(
+        X_test,
+        y_test,
         estimator=regression_model,
         n_permutations=20,
         method="predict",
@@ -25,14 +25,6 @@
         n_jobs=1,
     )
 
-    pi.fit(
-        X_train,
-        y_train,
-        groups=None,
-    )
-    vim = pi.score(X_test, y_test)
-
-    importance = vim["importance"]
     assert importance.shape == (X.shape[1],)
     assert (
         importance[important_features].mean()
@@ -46,45 +38,83 @@
     }
     X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])])
     X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, random_state=0)
-    regression_model.fit(X_train_df, y_train)
-    pi = PermutationImportance(
+    regression_model = LinearRegression()
+    importance, list_loss_j, loss_reference = permutation_importance(
+        X_test_df,
+        y_test,
         estimator=regression_model,
         n_permutations=20,
         method="predict",
         random_state=0,
         n_jobs=1,
+        groups=groups,
    )
-    pi.fit(
-        X_train_df,
-        y_train,
+
+    assert importance[0].mean() > 
importance[1].mean() + + # Same with groups + groups = { + "group_0": [i for i in important_features], + "the_group_1": [i for i in non_important_features], + } + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + regression_model = LinearRegression() + importance, list_loss_j, loss_reference = permutation_importance( + X_test, + y_test, + estimator=regression_model, + n_permutations=20, + method="predict", + random_state=0, + n_jobs=1, groups=groups, ) - vim = pi.score(X_test_df, y_test) - importance = vim["importance"] assert importance[0].mean() > importance[1].mean() # Classification case - y_clf = np.where(y > np.median(y), 1, 0) + y_clf = np.zeros_like(y) + for i, quantile in enumerate(np.arange(0.2, 0.8, 0.2)): + y_clf[np.where(y > np.quantile(y, quantile))] = i _, _, y_train_clf, y_test_clf = train_test_split(X, y_clf, random_state=0) logistic_model = LogisticRegression() - logistic_model.fit(X_train, y_train_clf) - pi_clf = PermutationImportance( + importance_clf, list_loss_j, loss_reference = permutation_importance( + X_test, + y_test_clf, estimator=logistic_model, n_permutations=20, method="predict_proba", random_state=0, n_jobs=1, loss=log_loss, - ) - - pi_clf.fit( - X_train, - y_train_clf, groups=None, ) - vim_clf = pi_clf.score(X_test, y_test_clf) - importance_clf = vim_clf["importance"] assert importance_clf.shape == (X.shape[1],) + + +def test_with_fitting(linear_scenario): + X, y, beta = linear_scenario + important_features = np.where(beta != 0)[0] + non_important_features = np.where(beta == 0)[0] + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + regression_model = LinearRegression() + regression_model.fit(X_train, y_train) + importance, list_loss_j, loss_reference = permutation_importance( + X_test, + y_test, + estimator=regression_model, + n_permutations=20, + method="predict", + random_state=0, + n_jobs=1, + ) + + assert importance.shape == (X.shape[1],) + assert ( + importance[important_features].mean() + > importance[non_important_features].mean() + ) diff --git a/test/test_permutation_importance_function.py b/test/test_permutation_importance_function.py deleted file mode 100644 index a0c76bae..00000000 --- a/test/test_permutation_importance_function.py +++ /dev/null @@ -1,120 +0,0 @@ -import numpy as np -import pandas as pd -from sklearn.linear_model import LinearRegression, LogisticRegression -from sklearn.metrics import log_loss -from sklearn.model_selection import train_test_split - -from hidimstat.permutation_importance_function import permutation_importance - - -def test_permutation_importance_no_fitting(linear_scenario): - X, y, beta = linear_scenario - important_features = np.where(beta != 0)[0] - non_important_features = np.where(beta == 0)[0] - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - - regression_model = LinearRegression() - importance, list_loss_j, loss_reference = permutation_importance( - X_test, - y_test, - estimator=regression_model, - n_permutations=20, - method="predict", - random_state=0, - n_jobs=1, - ) - - assert importance.shape == (X.shape[1],) - assert ( - importance[important_features].mean() - > importance[non_important_features].mean() - ) - - # Same with groups and a pd.DataFrame - groups = { - "group_0": [f"col_{i}" for i in important_features], - "the_group_1": [f"col_{i}" for i in non_important_features], - } - X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - X_train_df, X_test_df, y_train, y_test = 
train_test_split(X_df, y, random_state=0) - regression_model = LinearRegression() - importance, list_loss_j, loss_reference = permutation_importance( - X_test_df, - y_test, - estimator=regression_model, - n_permutations=20, - method="predict", - random_state=0, - n_jobs=1, - groups=groups, - ) - - assert importance[0].mean() > importance[1].mean() - - # Same with groups - groups = { - "group_0": [i for i in important_features], - "the_group_1": [i for i in non_important_features], - } - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - regression_model = LinearRegression() - importance, list_loss_j, loss_reference = permutation_importance( - X_test, - y_test, - estimator=regression_model, - n_permutations=20, - method="predict", - random_state=0, - n_jobs=1, - groups=groups, - ) - - assert importance[0].mean() > importance[1].mean() - - # Classification case - y_clf = np.zeros_like(y) - for i, quantile in enumerate(np.arange(0.2, 0.8, 0.2)): - y_clf[np.where(y > np.quantile(y, quantile))] = i - _, _, y_train_clf, y_test_clf = train_test_split(X, y_clf, random_state=0) - logistic_model = LogisticRegression() - - importance_clf, list_loss_j, loss_reference = permutation_importance( - X_test, - y_test_clf, - estimator=logistic_model, - n_permutations=20, - method="predict_proba", - random_state=0, - n_jobs=1, - loss=log_loss, - groups=None, - ) - - assert importance_clf.shape == (X.shape[1],) - - -def test_with_fitting(linear_scenario): - X, y, beta = linear_scenario - important_features = np.where(beta != 0)[0] - non_important_features = np.where(beta == 0)[0] - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - - regression_model = LinearRegression() - regression_model.fit(X_train, y_train) - importance, list_loss_j, loss_reference = permutation_importance( - X_test, - y_test, - estimator=regression_model, - n_permutations=20, - method="predict", - random_state=0, - n_jobs=1, - ) - - assert importance.shape == (X.shape[1],) - assert ( - importance[important_features].mean() - > importance[non_important_features].mean() - ) diff --git a/test/test_permutation_importance_scikitlearn.py b/test/test_permutation_importance_scikitlearn.py deleted file mode 100644 index 03a02c1d..00000000 --- a/test/test_permutation_importance_scikitlearn.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -from sklearn.linear_model import LinearRegression, LogisticRegression -from sklearn.model_selection import train_test_split - -from hidimstat.permutation_importance_scikitlearn import permutation_importance - - -def test_permutation_importance(linear_scenario): - X, y, beta = linear_scenario - important_features = np.where(beta != 0)[0] - non_important_features = np.where(beta == 0)[0] - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) - - regression_model = LinearRegression() - regression_model.fit(X_train, y_train) - vim = permutation_importance( - regression_model, - X_test, - y_test, - n_repeats=20, - scoring="r2", - random_state=0, - n_jobs=1, - ) - - importance = vim["importances_mean"] - - assert importance.shape == (X.shape[1],) - assert ( - importance[important_features].mean() - > importance[non_important_features].mean() - ) - - # impossible with groups - # # Same with groups and a pd.DataFrame - # groups = { - # "group_0": [f"col_{i}" for i in important_features], - # "the_group_1": [f"col_{i}" for i in non_important_features], - # } - # X_df = pd.DataFrame(X, columns=[f"col_{i}" for i in range(X.shape[1])]) - # 
X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, random_state=0) - # regression_model = LinearRegression() - # regression_model.fit(X_train_df, y_train) - # vim = permutation_importance( - # regression_model, - # X_test_df, - # y_test, - # n_repeats=20, - # scoring='r2', - # random_state=0, - # n_jobs=1, - # groups=groups - # ) - # importance = vim['importances_mean'] - # - # assert importance[0].mean() > importance[1].mean() - - # Classification case - y_clf = np.where(y > np.median(y), 1, 0) - _, _, y_train_clf, y_test_clf = train_test_split(X, y_clf, random_state=0) - logistic_model = LogisticRegression() - logistic_model.fit(X_train, y_train_clf) - vim_clf = permutation_importance( - logistic_model, - X_test, - y_test_clf, - n_repeats=20, - scoring="neg_log_loss", - random_state=0, - n_jobs=1, - ) - importance_clf = vim_clf["importances_mean"] - - assert importance_clf.shape == (X.shape[1],)
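
Usage sketch (illustrative, not part of the patch): the snippet below exercises the function-based API that this patch moves into src/hidimstat/permutation_importance.py. The synthetic dataset, the LinearRegression model, and the group definitions are assumptions chosen for demonstration; only the permutation_importance signature and its three return values come from the code above.

    # Hypothetical example; dataset and groups are made up for illustration.
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    from hidimstat.permutation_importance import permutation_importance

    # Toy regression problem (any tabular data with a scikit-learn estimator would do).
    X, y = make_regression(n_samples=200, n_features=6, n_informative=3, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model = LinearRegression().fit(X_train, y_train)

    # Per-feature importance: with groups=None every feature is its own group.
    importance, losses, loss_reference = permutation_importance(
        X_test,
        y_test,
        estimator=model,
        n_permutations=20,
        method="predict",
        random_state=0,
        n_jobs=1,
    )
    print(importance.shape)  # (6,) - one score per feature

    # Grouped importance: features listed in the same group are permuted together.
    groups = {"first_half": [0, 1, 2], "second_half": [3, 4, 5]}
    group_importance, _, _ = permutation_importance(
        X_test, y_test, estimator=model, groups=groups, random_state=0
    )
    print(group_importance.shape)  # (2,) - one score per group

A positive score means that permuting the corresponding feature (or group) increased the loss relative to loss_reference, i.e. the model relied on it for prediction.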