From 18dfb0db7c17aa398779ce653a9dc9d7f7b7df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Wed, 19 Jun 2024 19:09:35 +0000 Subject: [PATCH] Update knn shapely score computation (#1142) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/data_valuation.py | 53 +++- docs/source/tutorials/datalab/workflows.ipynb | 29 +- pyproject.toml | 2 +- tests/datalab/datalab/test_datalab.py | 298 +++++++++++++++++- tests/test_data_valuation.py | 59 +++- 5 files changed, 409 insertions(+), 32 deletions(-) diff --git a/cleanlab/data_valuation.py b/cleanlab/data_valuation.py index fcc00b9276..491f3d870d 100644 --- a/cleanlab/data_valuation.py +++ b/cleanlab/data_valuation.py @@ -27,20 +27,43 @@ from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index -def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np.ndarray: - """Compute the Shapley values of data points based on a knn graph.""" - N = labels.shape[0] +def _knn_shapley_score(neighbor_indices: np.ndarray, y: np.ndarray, k: int) -> np.ndarray: + """Compute the Data Shapley values of data points using neighbor indices in a K-Nearest Neighbors (KNN) graph. + + This function leverages equations (18) and (19) from the paper available at https://arxiv.org/abs/1908.08619 + for computational efficiency. + + Parameters + ---------- + neighbor_indices : + A 2D array where each row contains the indices of the k-nearest neighbors for each data point. + y : + A 1D array of target values corresponding to the data points. + k : + The number of nearest neighbors to consider for each data point. + + Notes + ----- + - The training set is used as its own test set for the KNN-Shapley value computation, meaning y_test is the same as y_train. + - `neighbor_indices` are assumed to be pre-sorted by distance, with the nearest neighbors appearing first, and with at least `k` neighbors. + - Unlike the referenced paper, this implementation does not account for an upper error bound epsilon. + Consequently, K* is treated as equal to K instead of K* = max(K, 1/epsilon). + - This simplification implies that the term min(K, j + 1) will always be j + 1, which is offset by the + corresponding denominator term in the inner loop. + - Dividing by K in the end achieves the same result as dividing by K* in the paper. + - The pre-allocated `scores` array incorporates equation (18) for j = k - 1, ensuring efficient computation. + """ + N = y.shape[0] scores = np.zeros((N, N)) - dist = knn_graph.indices.reshape(N, -1) - for y, s, dist_i in zip(labels, scores, dist): - idx = dist_i[::-1] - ans = labels[idx] - s[idx[k - 1]] = float(ans[k - 1] == y) - ans_matches = (ans == y).flatten() + for y_alpha, s_alpha, idx in zip(y, scores, neighbor_indices): + y_neighbors = y[idx] + ans_matches = (y_neighbors == y_alpha).flatten() for j in range(k - 2, -1, -1): - s[idx[j]] = s[idx[j + 1]] + float(int(ans_matches[j]) - int(ans_matches[j + 1])) - return 0.5 * (np.mean(scores / k, axis=0) + 1) + s_alpha[idx[j]] = s_alpha[idx[j + 1]] + float( + int(ans_matches[j]) - int(ans_matches[j + 1]) + ) + return np.mean(scores / k, axis=0) def data_shapley_knn( @@ -91,7 +114,7 @@ def data_shapley_knn( An array of transformed Data Shapley values for each data point, calibrated to indicate their relative importance. These scores have been adjusted to fall within 0 to 1. Values closer to 1 indicate data points that are highly influential and positively contribute to a trained ML model's performance. 
- Conversely, scores below 0.5 indicate data points estimated to negatively impact model performance. + Conversely, scores below 0.5 indicate data points estimated to negatively impact model performance. Raises ------ @@ -113,4 +136,8 @@ def data_shapley_knn( # Use provided knn_graph or compute it from features if knn_graph is None: knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k, metric=metric) - return _knn_shapley_score(knn_graph, labels, k) + + num_examples = labels.shape[0] + distances = knn_graph.indices.reshape(num_examples, -1) + scores = _knn_shapley_score(neighbor_indices=distances, y=labels, k=k) + return 0.5 * (scores + 1) diff --git a/docs/source/tutorials/datalab/workflows.ipynb b/docs/source/tutorials/datalab/workflows.ipynb index 43e7c0a941..9abe7458df 100644 --- a/docs/source/tutorials/datalab/workflows.ipynb +++ b/docs/source/tutorials/datalab/workflows.ipynb @@ -325,7 +325,11 @@ "metadata": {}, "source": [ "### 4. (Optional) Visualize Data Valuation Scores\n", - "Finally, we will visualize the data valuation scores using a histogram to understand the distribution of scores across different labels." + "Let's visualize the data valuation scores across our dataset.\n", + "\n", + "Cleanlab's Shapely scores are transformed to lie between 0 and 1 such that: a score below 0.5 indicates a negative contribution to the model's training performance, while a score above 0.5 indicates a positive contribution.\n", + "\n", + "By examining the scores across different classes, we can identify whether positive or negative contributions are disproportionately concentrated in a single class. This can help detect biases in the training data." ] }, { @@ -337,29 +341,28 @@ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", - "# Prepare the data for plotting a histogram\n", + "# Prepare the data for plotting\n", "plot_data = (\n", " data_valuation_issues\n", " # Optionally, add a 'given_label' column to distinguish between labels in the histogram\n", " .join(pd.DataFrame({\"given_label\": df_text[\"Label\"]}))\n", ")\n", "\n", - "# Plot histograms of data valuation scores for each label\n", - "sns.histplot(\n", + "# Plot strip plots of data valuation scores for each label\n", + "sns.stripplot(\n", " data=plot_data,\n", - " hue=\"given_label\", # Comment out if no labels should be used in the visualization\n", " x=\"data_valuation_score\",\n", - " bins=15,\n", - " element=\"step\",\n", - " multiple=\"stack\", # Stack histograms for different labels\n", + " hue=\"given_label\", # Comment out if no labels should be used in the visualization\n", + " dodge=True,\n", + " jitter=0.3,\n", + " alpha=0.5,\n", ")\n", "\n", - "# Set y-axis to a logarithmic scale for better visualization of wide-ranging counts\n", - "plt.yscale(\"log\")\n", - "plt.yscale(\"log\")\n", - "plt.title(\"Data Valuation Scores by Label\")\n", + "plt.axvline(lab.info[\"data_valuation\"][\"threshold\"], color=\"red\", linestyle=\"--\", label=\"Issue Threshold\")\n", + "\n", + "plt.title(\"Strip plot of Data Valuation Scores by Label\")\n", "plt.xlabel(\"Data Valuation Score\")\n", - "plt.ylabel(\"Count (log scale)\")\n", + "plt.legend()\n", "plt.show()" ] }, diff --git a/pyproject.toml b/pyproject.toml index 77b426158d..0b71a3e68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ name = "cleanlab" # requirements files see: # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/ dependencies = [ - "numpy>=1.22.0", + "numpy~=1.22", "scikit-learn>=1.1", 
"tqdm>=4.53.0", "pandas>=1.4.0", diff --git a/tests/datalab/datalab/test_datalab.py b/tests/datalab/datalab/test_datalab.py index f2aedd9dbb..5cd5e94219 100644 --- a/tests/datalab/datalab/test_datalab.py +++ b/tests/datalab/datalab/test_datalab.py @@ -28,6 +28,9 @@ import pytest from datasets.dataset_dict import DatasetDict from scipy.sparse import csr_matrix +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_blobs @@ -1669,11 +1672,298 @@ def test_all_identical_dataset(self): assert data_valuation_issues["is_data_valuation_issue"].sum() == 0 # For a full knn-graph, all data points have the same value. Here, they all contribute the same value. - # The score of 54/99 is a value that works for 11 identical data points. - # TODO: Find a reasonable test for larger dataset, with k much smaller than N. Hard to guarantee a score of 0.5. - np.testing.assert_allclose( - data_valuation_issues["data_valuation_score"].to_numpy(), 54 / 99 + np.testing.assert_allclose(data_valuation_issues["data_valuation_score"].to_numpy(), 0.5) + + @pytest.mark.parametrize("N", [40, 100]) + @pytest.mark.parametrize("M", [2, 3, 4]) + def test_label_error_has_lower_data_valuation_score(self, N, M): + """Test that the for one point with a label error (in a binary classification task for a 2D blob dataset), its data valuation score is lower than the others.""" + np.random.seed(SEED) # Set seed for reproducibility + + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Add label error to one data point + idx = np.random.choice(N, 1) + y_noisy = np.copy(y_true) + y_noisy[idx] = 1 - y_noisy[idx] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y_noisy}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # At least one point has a data valuation issue (the one with the label error) + issue_ids = data_valuation_issues.query("is_data_valuation_issue").index.tolist() + num_issues = len(issue_ids) + assert num_issues == 1 + + # The scores should be lower for points with label errors + scores = data_valuation_issues["data_valuation_score"].to_numpy() + np.testing.assert_array_less( + scores[idx], 0.5 + ) # The point with the obvious label error should have a score lower than 0.5 + # But any other example flagged as an issue may have the same score or greater + if num_issues > 1: + assert np.all(scores[idx][0] <= scores[issue_ids]) and np.all(scores[issue_ids] < 0.5) + np.testing.assert_array_less( + scores[idx][0], np.delete(scores, issue_ids) + ) # All other points should have a higher score + + def test_outlier_has_lower_data_valuation_score(self): + """Test that for one point being an outlier (in a binary classification task for a 2D blob dataset), its data valuation score is lower than the others. + + TODO: Figure out when we can assert that an outlier gets a data valuation score of 0.0 (while not necessarily being a label error). 
+ """ + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 20, 10 + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Turn one data point into an outlier + idx = np.random.choice(N, 1) + X_outlier = np.copy(X) + X_outlier[idx] *= -1 # Making the point an outlier, by flipping the sign of all features + + # Validate the outlier creation + assert not np.array_equal(X[idx], X_outlier[idx]), "Outlier creation failed." + + # Run dataset through Datalab + lab = Datalab(data={"X": X_outlier, "y": y_true}, label_name="y") + lab.find_issues(features=X_outlier, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # The scores should be lower for points with outliers + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Validate scores are within an expected range (if applicable) + assert np.all(scores >= 0) and np.all( + scores <= 1 + ), "Scores are out of the expected range [0, 1]." + + # Check the outlier's score + assert np.all( + scores[idx] == 0.5 + ), "The outlier should have a score equal to 0.5." # Only this particular random seed guarantees this + np.testing.assert_array_less( + scores[idx][0], np.delete(scores, idx) + ) # All other points should have a higher score + + def test_duplicate_points_have_similar_scores(self): + """Test that duplicate points in the dataset have similar data valuation scores, which are higher compared to non-duplicate points.""" + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 50, 5 + X, y = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Introduce duplicate points + duplicate_indices = np.random.choice(N, 2, replace=False) + X[duplicate_indices[1]] = X[duplicate_indices[0]] + y[duplicate_indices[1]] = y[duplicate_indices[0]] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # The duplicate points don't have data valuation issues + assert data_valuation_issues["is_data_valuation_issue"][duplicate_indices].sum() == 0 + + # The scores for duplicate points should be similar + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + duplicate_scores = scores[duplicate_indices] + non_duplicate_scores = np.delete(scores, duplicate_indices) + + # Check that duplicate points have identical scores + assert len(set(duplicate_scores)) == 1 + + # Check that duplicate points have higher scores than non-duplicate points + assert np.all(non_duplicate_scores <= duplicate_scores[0]) + + ### Now add label noise to one of the duplicates + + y[duplicate_indices[0]] = 1 - y[duplicate_indices[0]] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + is_issue = data_valuation_issues["is_data_valuation_issue"] + scores = data_valuation_issues["data_valuation_score"] + + # Only one of the duplicates points should have a data valuation issue + assert is_issue.sum() == 1 + assert is_issue[duplicate_indices].sum() == 1 + + # The scores should be as low as possible + assert scores[duplicate_indices[0]] == 0.491 + assert scores[duplicate_indices[1]] == 0.500 + + def add_label_noise(self, y, noise_level=0.1): + """Introduce random label noise to the labels.""" + np.random.seed(SEED) + n_samples = 
len(y) + n_noisy = int(noise_level * n_samples) + noisy_indices = np.random.choice(n_samples, n_noisy, replace=False) + y_noisy = np.copy(y) + y_noisy[noisy_indices] = 1 - y_noisy[noisy_indices] # Flip the labels + return y_noisy, noisy_indices + + @pytest.mark.parametrize("remove_percentage", [0.2, 0.3, 0.4, 0.5]) + def test_removing_low_valuation_points_improves_classification_accuracy_binary( + self, remove_percentage + ): + """Test that removing the bottom X% of data valuation scores improves ML performance compared to removing random points. + + NOTES + ----- + - This is applied to a binary classification task on a 2D dataset. + - This test does not consider filtering by the `is_data_valuation_issue` column before setting the threshold. + """ + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 300, 5 + + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Split data into training and testing sets + X_train, X_test, y_train_true, y_test = train_test_split( + X, y_true, test_size=0.3, random_state=SEED + ) + + # Introduce random label noise + noise_level = 0.4 + y_noisy, noisy_indices = self.add_label_noise(y_train_true, noise_level) + + # Run dataset through Datalab + lab = Datalab(data={"X": X_train, "y": y_noisy}, label_name="y") + lab.find_issues(features=X_train, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Calculate the threshold for the bottom X% of scores + threshold = np.percentile(scores, remove_percentage * 100) + + # Identify the indices to remove based on data valuation scores + indices_to_remove_valuation = np.where(scores <= threshold)[0] + + # Create training set by removing the identified points + X_train_valuation_removed = np.delete(X_train, indices_to_remove_valuation, axis=0) + y_train_noisy_valuation_removed = np.delete(y_noisy, indices_to_remove_valuation, axis=0) + + # Train a model on the reduced dataset + clf_valuation = LogisticRegression(random_state=SEED) + clf_valuation.fit(X_train_valuation_removed, y_train_noisy_valuation_removed) + y_pred_valuation = clf_valuation.predict(X_test) + accuracy_valuation = accuracy_score(y_test, y_pred_valuation) + + # Randomly remove the same amount of data points + random_indices = np.random.choice( + len(X_train), len(indices_to_remove_valuation), replace=False + ) + X_train_random_removed = np.delete(X_train, random_indices, axis=0) + y_train_noisy_random_removed = np.delete(y_noisy, random_indices, axis=0) + + # Train a model on the randomly reduced dataset + clf_random = LogisticRegression(random_state=SEED) + clf_random.fit(X_train_random_removed, y_train_noisy_random_removed) + y_pred_random = clf_random.predict(X_test) + accuracy_random = accuracy_score(y_test, y_pred_random) + + # Assert that removing low valuation points leads to better performance + assert ( + accuracy_valuation > accuracy_random + ), f"Expected accuracy with valuation removal ({accuracy_valuation}) to be higher than random removal ({accuracy_random})" + if accuracy_valuation < 1.0: + assert (1 - accuracy_random) / ( + 1 - accuracy_valuation + ) >= 1.6, "Expected at least a 60% improvement in error rate after removing low valuation points" + + def add_multi_class_noise(self, y, noise_level=0.1): + """Introduce random label noise to the labels.""" + np.random.seed(SEED) + n_samples = len(y) + n_noisy = int(noise_level * n_samples) + noisy_indices = np.random.choice(n_samples, 
n_noisy, replace=False) + y_noisy = np.copy(y) + y_noisy[noisy_indices] = np.random.choice( + np.delete(np.unique(y), y[noisy_indices]), n_noisy ) + return y_noisy, noisy_indices + + @pytest.mark.parametrize("remove_percentage", [0.2, 0.3, 0.4, 0.5]) + def test_removing_low_valuation_points_improves_classification_accuracy_multi_class( + self, remove_percentage + ): + """Test that removing the bottom X% of data valuation scores improves ML performance compared to removing random points. + + NOTES + ----- + - This is applied to a multi-class classification task on a 2D dataset. + - This test does not consider filtering by the `is_data_valuation_issue` column before setting the threshold. + """ + np.random.seed(SEED) + + N, M = 300, 5 + X, y_true = make_blobs(n_samples=N, centers=4, n_features=M, random_state=SEED) + # Split data into training and testing sets + X_train, X_test, y_train_true, y_test = train_test_split( + X, y_true, test_size=0.3, random_state=SEED + ) + + # Introduce random label noise + noise_level = 0.3 + y_noisy, noisy_indices = self.add_label_noise(y_train_true, noise_level) + + # Run dataset through Datalab + lab = Datalab(data={"X": X_train, "y": y_noisy}, label_name="y") + lab.find_issues(features=X_train, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Calculate the threshold for the bottom X% of scores + threshold = np.percentile(scores, remove_percentage * 100) + + # Identify the indices to remove based on data valuation scores + indices_to_remove_valuation = np.where(scores <= threshold)[0] + + # Create training set by removing the identified points + X_train_valuation_removed = np.delete(X_train, indices_to_remove_valuation, axis=0) + y_train_noisy_valuation_removed = np.delete(y_noisy, indices_to_remove_valuation, axis=0) + + # Train a model on the reduced dataset + clf_valuation = LogisticRegression(random_state=SEED) + clf_valuation.fit(X_train_valuation_removed, y_train_noisy_valuation_removed) + y_pred_valuation = clf_valuation.predict(X_test) + accuracy_valuation = accuracy_score(y_test, y_pred_valuation) + + # Randomly remove the same amount of data points + random_indices = np.random.choice( + len(X_train), len(indices_to_remove_valuation), replace=False + ) + X_train_random_removed = np.delete(X_train, random_indices, axis=0) + y_train_noisy_random_removed = np.delete(y_noisy, random_indices, axis=0) + + # Train a model on the randomly reduced dataset + clf_random = LogisticRegression(random_state=SEED) + clf_random.fit(X_train_random_removed, y_train_noisy_random_removed) + y_pred_random = clf_random.predict(X_test) + accuracy_random = accuracy_score(y_test, y_pred_random) + + # Assert that removing low valuation points leads to better performance + assert ( + accuracy_valuation > accuracy_random + ), f"Expected accuracy with valuation removal ({accuracy_valuation}) to be higher than random removal ({accuracy_random})" + if accuracy_valuation < 1.0: + assert (1 - accuracy_random) / ( + 1 - accuracy_valuation + ) >= 1.6, "Expected at least a 60% improvement in error rate after removing low valuation points" class TestIssueManagersReuseKnnGraph: diff --git a/tests/test_data_valuation.py b/tests/test_data_valuation.py index 390b740459..197ef923d7 100644 --- a/tests/test_data_valuation.py +++ b/tests/test_data_valuation.py @@ -16,10 +16,14 @@ import numpy as np import pytest +from hypothesis import given, settings, strategies as st 
+from hypothesis.strategies import composite +from hypothesis.extra.numpy import arrays from sklearn.neighbors import NearestNeighbors -from cleanlab.data_valuation import data_shapley_knn +from cleanlab.data_valuation import _knn_shapley_score, data_shapley_knn +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index class TestDataValuation: @@ -52,3 +56,56 @@ def test_data_shapley_knn_with_knn_graph(self, labels, knn_graph): assert shapley.shape == (100,) assert np.all(shapley >= 0) assert np.all(shapley <= 1) + + +@composite +def valid_data(draw): + """ + A custom strategy to generate valid labels, features, and k such that: + - labels and features have the same length + - k is less than the length of labels and features + """ + # Generate a valid length for labels and features + length = draw(st.integers(min_value=11, max_value=1000)) + + # Generate labels and features of the same length + labels = draw( + arrays( + dtype=np.int32, + shape=length, + elements=st.integers(min_value=0, max_value=length - 1), + ) + ) + features = draw( + arrays( + dtype=np.float64, + shape=(length, draw(st.integers(min_value=2, max_value=50))), + elements=st.floats(min_value=-1.0, max_value=1.0), + ) + ) + + # Generate k such that it is less than the length of labels and features + k = draw(st.integers(min_value=1, max_value=length - 1)) + + return labels, features, k + + +class TestDataShapleyKNNScore: + """This test class prioritizes testing the raw/untransformed outputs of the _knn_shapley_score function.""" + + @settings( + max_examples=1000, deadline=None + ) # Increase the number of examples to test more cases + @given(valid_data()) + def test_knn_shapley_score_property(self, data): + labels, features, k = data + + knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k) + neighbor_indices = knn_graph.indices.reshape(-1, k) + + scores = _knn_shapley_score(neighbor_indices, labels, k) + + # Shapley scores should be between -1 and 1 + assert scores.shape == (len(labels),) + assert np.all(scores >= -1) + assert np.all(scores <= 1)
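
The refactor above separates the raw KNN-Shapley computation (`_knn_shapley_score`, whose values fall roughly in [-1, 1]) from the affine rescaling to [0, 1] that `data_shapley_knn` returns. A minimal sketch of how the two relate, mirroring the new property-based test and assuming only the signatures shown in this patch (dataset sizes and random seed are illustrative):

    import numpy as np
    from sklearn.datasets import make_blobs

    from cleanlab.data_valuation import _knn_shapley_score, data_shapley_knn
    from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index

    # Illustrative synthetic dataset; sizes and random_state are arbitrary choices.
    features, labels = make_blobs(n_samples=50, centers=2, n_features=5, random_state=0)
    k = 10

    # Build the KNN graph the same way data_shapley_knn does when no graph is supplied.
    knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k)
    neighbor_indices = knn_graph.indices.reshape(len(labels), -1)

    # Raw per-example KNN-Shapley values, roughly in [-1, 1]; values below 0 correspond
    # to examples estimated to negatively impact a model trained on this data.
    raw_scores = _knn_shapley_score(neighbor_indices, labels, k)

    # The public function rescales these via 0.5 * (raw + 1), so scores land in [0, 1]
    # with 0.5 as the neutral point referenced throughout the updated docstring.
    transformed = data_shapley_knn(labels, features=features, k=k)
    assert np.allclose(transformed, 0.5 * (raw_scores + 1))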
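
The new Datalab tests exercise the same workflow a user would follow: flag low-valuation training points and drop them before refitting a model. A condensed, self-contained sketch of that loop, using only the Datalab calls and columns that appear in this patch (the dataset, the 20% percentile cutoff, and the logistic-regression model are illustrative assumptions):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    from cleanlab import Datalab

    X, y = make_blobs(n_samples=200, centers=2, n_features=5, random_state=0)

    # Run the data_valuation check on the training data.
    lab = Datalab(data={"X": X, "y": y}, label_name="y")
    lab.find_issues(features=X, issue_types={"data_valuation": {}})
    issues = lab.get_issues("data_valuation")

    scores = issues["data_valuation_score"].to_numpy()
    # Scores below 0.5 estimate a negative contribution to model performance;
    # Datalab's own cutoff is stored in lab.info["data_valuation"]["threshold"].
    keep = scores > np.percentile(scores, 20)  # drop the lowest-valued 20% (arbitrary choice)

    # Refit on the retained points only.
    clf = LogisticRegression(random_state=0).fit(X[keep], y[keep])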