From 18dfb0db7c17aa398779ce653a9dc9d7f7b7df62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Wed, 19 Jun 2024 19:09:35 +0000 Subject: [PATCH] Update knn shapely score computation (#1142) Co-authored-by: Jonas Mueller <1390638+jwmueller@users.noreply.github.com> --- cleanlab/data_valuation.py | 53 +++- docs/source/tutorials/datalab/workflows.ipynb | 29 +- pyproject.toml | 2 +- tests/datalab/datalab/test_datalab.py | 298 +++++++++++++++++- tests/test_data_valuation.py | 59 +++- 5 files changed, 409 insertions(+), 32 deletions(-) diff --git a/cleanlab/data_valuation.py b/cleanlab/data_valuation.py index fcc00b9276..491f3d870d 100644 --- a/cleanlab/data_valuation.py +++ b/cleanlab/data_valuation.py @@ -27,20 +27,43 @@ from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index -def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np.ndarray: - """Compute the Shapley values of data points based on a knn graph.""" - N = labels.shape[0] +def _knn_shapley_score(neighbor_indices: np.ndarray, y: np.ndarray, k: int) -> np.ndarray: + """Compute the Data Shapley values of data points using neighbor indices in a K-Nearest Neighbors (KNN) graph. + + This function leverages equations (18) and (19) from the paper available at https://arxiv.org/abs/1908.08619 + for computational efficiency. + + Parameters + ---------- + neighbor_indices : + A 2D array where each row contains the indices of the k-nearest neighbors for each data point. + y : + A 1D array of target values corresponding to the data points. + k : + The number of nearest neighbors to consider for each data point. + + Notes + ----- + - The training set is used as its own test set for the KNN-Shapley value computation, meaning y_test is the same as y_train. + - `neighbor_indices` are assumed to be pre-sorted by distance, with the nearest neighbors appearing first, and with at least `k` neighbors. + - Unlike the referenced paper, this implementation does not account for an upper error bound epsilon. + Consequently, K* is treated as equal to K instead of K* = max(K, 1/epsilon). + - This simplification implies that the term min(K, j + 1) will always be j + 1, which is offset by the + corresponding denominator term in the inner loop. + - Dividing by K in the end achieves the same result as dividing by K* in the paper. + - The pre-allocated `scores` array incorporates equation (18) for j = k - 1, ensuring efficient computation. + """ + N = y.shape[0] scores = np.zeros((N, N)) - dist = knn_graph.indices.reshape(N, -1) - for y, s, dist_i in zip(labels, scores, dist): - idx = dist_i[::-1] - ans = labels[idx] - s[idx[k - 1]] = float(ans[k - 1] == y) - ans_matches = (ans == y).flatten() + for y_alpha, s_alpha, idx in zip(y, scores, neighbor_indices): + y_neighbors = y[idx] + ans_matches = (y_neighbors == y_alpha).flatten() for j in range(k - 2, -1, -1): - s[idx[j]] = s[idx[j + 1]] + float(int(ans_matches[j]) - int(ans_matches[j + 1])) - return 0.5 * (np.mean(scores / k, axis=0) + 1) + s_alpha[idx[j]] = s_alpha[idx[j + 1]] + float( + int(ans_matches[j]) - int(ans_matches[j + 1]) + ) + return np.mean(scores / k, axis=0) def data_shapley_knn( @@ -91,7 +114,7 @@ def data_shapley_knn( An array of transformed Data Shapley values for each data point, calibrated to indicate their relative importance. These scores have been adjusted to fall within 0 to 1. Values closer to 1 indicate data points that are highly influential and positively contribute to a trained ML model's performance. 
- Conversely, scores below 0.5 indicate data points estimated to negatively impact model performance. + Conversely, scores below 0.5 indicate data points estimated to negatively impact model performance. Raises ------ @@ -113,4 +136,8 @@ def data_shapley_knn( # Use provided knn_graph or compute it from features if knn_graph is None: knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k, metric=metric) - return _knn_shapley_score(knn_graph, labels, k) + + num_examples = labels.shape[0] + distances = knn_graph.indices.reshape(num_examples, -1) + scores = _knn_shapley_score(neighbor_indices=distances, y=labels, k=k) + return 0.5 * (scores + 1) diff --git a/docs/source/tutorials/datalab/workflows.ipynb b/docs/source/tutorials/datalab/workflows.ipynb index 43e7c0a941..9abe7458df 100644 --- a/docs/source/tutorials/datalab/workflows.ipynb +++ b/docs/source/tutorials/datalab/workflows.ipynb @@ -325,7 +325,11 @@ "metadata": {}, "source": [ "### 4. (Optional) Visualize Data Valuation Scores\n", - "Finally, we will visualize the data valuation scores using a histogram to understand the distribution of scores across different labels." + "Let's visualize the data valuation scores across our dataset.\n", + "\n", + "Cleanlab's Shapely scores are transformed to lie between 0 and 1 such that: a score below 0.5 indicates a negative contribution to the model's training performance, while a score above 0.5 indicates a positive contribution.\n", + "\n", + "By examining the scores across different classes, we can identify whether positive or negative contributions are disproportionately concentrated in a single class. This can help detect biases in the training data." ] }, { @@ -337,29 +341,28 @@ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", - "# Prepare the data for plotting a histogram\n", + "# Prepare the data for plotting\n", "plot_data = (\n", " data_valuation_issues\n", " # Optionally, add a 'given_label' column to distinguish between labels in the histogram\n", " .join(pd.DataFrame({\"given_label\": df_text[\"Label\"]}))\n", ")\n", "\n", - "# Plot histograms of data valuation scores for each label\n", - "sns.histplot(\n", + "# Plot strip plots of data valuation scores for each label\n", + "sns.stripplot(\n", " data=plot_data,\n", - " hue=\"given_label\", # Comment out if no labels should be used in the visualization\n", " x=\"data_valuation_score\",\n", - " bins=15,\n", - " element=\"step\",\n", - " multiple=\"stack\", # Stack histograms for different labels\n", + " hue=\"given_label\", # Comment out if no labels should be used in the visualization\n", + " dodge=True,\n", + " jitter=0.3,\n", + " alpha=0.5,\n", ")\n", "\n", - "# Set y-axis to a logarithmic scale for better visualization of wide-ranging counts\n", - "plt.yscale(\"log\")\n", - "plt.yscale(\"log\")\n", - "plt.title(\"Data Valuation Scores by Label\")\n", + "plt.axvline(lab.info[\"data_valuation\"][\"threshold\"], color=\"red\", linestyle=\"--\", label=\"Issue Threshold\")\n", + "\n", + "plt.title(\"Strip plot of Data Valuation Scores by Label\")\n", "plt.xlabel(\"Data Valuation Score\")\n", - "plt.ylabel(\"Count (log scale)\")\n", + "plt.legend()\n", "plt.show()" ] }, diff --git a/pyproject.toml b/pyproject.toml index 77b426158d..0b71a3e68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ name = "cleanlab" # requirements files see: # https://packaging.python.org/en/latest/discussions/install-requires-vs-requirements/ dependencies = [ - "numpy>=1.22.0", + "numpy~=1.22", "scikit-learn>=1.1", 
"tqdm>=4.53.0", "pandas>=1.4.0", diff --git a/tests/datalab/datalab/test_datalab.py b/tests/datalab/datalab/test_datalab.py index f2aedd9dbb..5cd5e94219 100644 --- a/tests/datalab/datalab/test_datalab.py +++ b/tests/datalab/datalab/test_datalab.py @@ -28,6 +28,9 @@ import pytest from datasets.dataset_dict import DatasetDict from scipy.sparse import csr_matrix +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_blobs @@ -1669,11 +1672,298 @@ def test_all_identical_dataset(self): assert data_valuation_issues["is_data_valuation_issue"].sum() == 0 # For a full knn-graph, all data points have the same value. Here, they all contribute the same value. - # The score of 54/99 is a value that works for 11 identical data points. - # TODO: Find a reasonable test for larger dataset, with k much smaller than N. Hard to guarantee a score of 0.5. - np.testing.assert_allclose( - data_valuation_issues["data_valuation_score"].to_numpy(), 54 / 99 + np.testing.assert_allclose(data_valuation_issues["data_valuation_score"].to_numpy(), 0.5) + + @pytest.mark.parametrize("N", [40, 100]) + @pytest.mark.parametrize("M", [2, 3, 4]) + def test_label_error_has_lower_data_valuation_score(self, N, M): + """Test that the for one point with a label error (in a binary classification task for a 2D blob dataset), its data valuation score is lower than the others.""" + np.random.seed(SEED) # Set seed for reproducibility + + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Add label error to one data point + idx = np.random.choice(N, 1) + y_noisy = np.copy(y_true) + y_noisy[idx] = 1 - y_noisy[idx] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y_noisy}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # At least one point has a data valuation issue (the one with the label error) + issue_ids = data_valuation_issues.query("is_data_valuation_issue").index.tolist() + num_issues = len(issue_ids) + assert num_issues == 1 + + # The scores should be lower for points with label errors + scores = data_valuation_issues["data_valuation_score"].to_numpy() + np.testing.assert_array_less( + scores[idx], 0.5 + ) # The point with the obvious label error should have a score lower than 0.5 + # But any other example flagged as an issue may have the same score or greater + if num_issues > 1: + assert np.all(scores[idx][0] <= scores[issue_ids]) and np.all(scores[issue_ids] < 0.5) + np.testing.assert_array_less( + scores[idx][0], np.delete(scores, issue_ids) + ) # All other points should have a higher score + + def test_outlier_has_lower_data_valuation_score(self): + """Test that for one point being an outlier (in a binary classification task for a 2D blob dataset), its data valuation score is lower than the others. + + TODO: Figure out when we can assert that an outlier gets a data valuation score of 0.0 (while not necessarily being a label error). 
+ """ + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 20, 10 + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Turn one data point into an outlier + idx = np.random.choice(N, 1) + X_outlier = np.copy(X) + X_outlier[idx] *= -1 # Making the point an outlier, by flipping the sign of all features + + # Validate the outlier creation + assert not np.array_equal(X[idx], X_outlier[idx]), "Outlier creation failed." + + # Run dataset through Datalab + lab = Datalab(data={"X": X_outlier, "y": y_true}, label_name="y") + lab.find_issues(features=X_outlier, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # The scores should be lower for points with outliers + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Validate scores are within an expected range (if applicable) + assert np.all(scores >= 0) and np.all( + scores <= 1 + ), "Scores are out of the expected range [0, 1]." + + # Check the outlier's score + assert np.all( + scores[idx] == 0.5 + ), "The outlier should have a score equal to 0.5." # Only this particular random seed guarantees this + np.testing.assert_array_less( + scores[idx][0], np.delete(scores, idx) + ) # All other points should have a higher score + + def test_duplicate_points_have_similar_scores(self): + """Test that duplicate points in the dataset have similar data valuation scores, which are higher compared to non-duplicate points.""" + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 50, 5 + X, y = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Introduce duplicate points + duplicate_indices = np.random.choice(N, 2, replace=False) + X[duplicate_indices[1]] = X[duplicate_indices[0]] + y[duplicate_indices[1]] = y[duplicate_indices[0]] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + + # The duplicate points don't have data valuation issues + assert data_valuation_issues["is_data_valuation_issue"][duplicate_indices].sum() == 0 + + # The scores for duplicate points should be similar + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + duplicate_scores = scores[duplicate_indices] + non_duplicate_scores = np.delete(scores, duplicate_indices) + + # Check that duplicate points have identical scores + assert len(set(duplicate_scores)) == 1 + + # Check that duplicate points have higher scores than non-duplicate points + assert np.all(non_duplicate_scores <= duplicate_scores[0]) + + ### Now add label noise to one of the duplicates + + y[duplicate_indices[0]] = 1 - y[duplicate_indices[0]] + + # Run dataset through Datalab + lab = Datalab(data={"X": X, "y": y}, label_name="y") + lab.find_issues(features=X, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + is_issue = data_valuation_issues["is_data_valuation_issue"] + scores = data_valuation_issues["data_valuation_score"] + + # Only one of the duplicates points should have a data valuation issue + assert is_issue.sum() == 1 + assert is_issue[duplicate_indices].sum() == 1 + + # The scores should be as low as possible + assert scores[duplicate_indices[0]] == 0.491 + assert scores[duplicate_indices[1]] == 0.500 + + def add_label_noise(self, y, noise_level=0.1): + """Introduce random label noise to the labels.""" + np.random.seed(SEED) + n_samples = 
len(y) + n_noisy = int(noise_level * n_samples) + noisy_indices = np.random.choice(n_samples, n_noisy, replace=False) + y_noisy = np.copy(y) + y_noisy[noisy_indices] = 1 - y_noisy[noisy_indices] # Flip the labels + return y_noisy, noisy_indices + + @pytest.mark.parametrize("remove_percentage", [0.2, 0.3, 0.4, 0.5]) + def test_removing_low_valuation_points_improves_classification_accuracy_binary( + self, remove_percentage + ): + """Test that removing the bottom X% of data valuation scores improves ML performance compared to removing random points. + + NOTES + ----- + - This is applied to a binary classification task on a 2D dataset. + - This test does not consider filtering by the `is_data_valuation_issue` column before setting the threshold. + """ + np.random.seed(SEED) # Set seed for reproducibility + + N, M = 300, 5 + + X, y_true = make_blobs(n_samples=N, centers=2, n_features=M, random_state=SEED) + + # Split data into training and testing sets + X_train, X_test, y_train_true, y_test = train_test_split( + X, y_true, test_size=0.3, random_state=SEED + ) + + # Introduce random label noise + noise_level = 0.4 + y_noisy, noisy_indices = self.add_label_noise(y_train_true, noise_level) + + # Run dataset through Datalab + lab = Datalab(data={"X": X_train, "y": y_noisy}, label_name="y") + lab.find_issues(features=X_train, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Calculate the threshold for the bottom X% of scores + threshold = np.percentile(scores, remove_percentage * 100) + + # Identify the indices to remove based on data valuation scores + indices_to_remove_valuation = np.where(scores <= threshold)[0] + + # Create training set by removing the identified points + X_train_valuation_removed = np.delete(X_train, indices_to_remove_valuation, axis=0) + y_train_noisy_valuation_removed = np.delete(y_noisy, indices_to_remove_valuation, axis=0) + + # Train a model on the reduced dataset + clf_valuation = LogisticRegression(random_state=SEED) + clf_valuation.fit(X_train_valuation_removed, y_train_noisy_valuation_removed) + y_pred_valuation = clf_valuation.predict(X_test) + accuracy_valuation = accuracy_score(y_test, y_pred_valuation) + + # Randomly remove the same amount of data points + random_indices = np.random.choice( + len(X_train), len(indices_to_remove_valuation), replace=False + ) + X_train_random_removed = np.delete(X_train, random_indices, axis=0) + y_train_noisy_random_removed = np.delete(y_noisy, random_indices, axis=0) + + # Train a model on the randomly reduced dataset + clf_random = LogisticRegression(random_state=SEED) + clf_random.fit(X_train_random_removed, y_train_noisy_random_removed) + y_pred_random = clf_random.predict(X_test) + accuracy_random = accuracy_score(y_test, y_pred_random) + + # Assert that removing low valuation points leads to better performance + assert ( + accuracy_valuation > accuracy_random + ), f"Expected accuracy with valuation removal ({accuracy_valuation}) to be higher than random removal ({accuracy_random})" + if accuracy_valuation < 1.0: + assert (1 - accuracy_random) / ( + 1 - accuracy_valuation + ) >= 1.6, "Expected at least a 60% improvement in error rate after removing low valuation points" + + def add_multi_class_noise(self, y, noise_level=0.1): + """Introduce random label noise to the labels.""" + np.random.seed(SEED) + n_samples = len(y) + n_noisy = int(noise_level * n_samples) + noisy_indices = np.random.choice(n_samples, 
n_noisy, replace=False) + y_noisy = np.copy(y) + y_noisy[noisy_indices] = np.random.choice( + np.delete(np.unique(y), y[noisy_indices]), n_noisy ) + return y_noisy, noisy_indices + + @pytest.mark.parametrize("remove_percentage", [0.2, 0.3, 0.4, 0.5]) + def test_removing_low_valuation_points_improves_classification_accuracy_multi_class( + self, remove_percentage + ): + """Test that removing the bottom X% of data valuation scores improves ML performance compared to removing random points. + + NOTES + ----- + - This is applied to a multi-class classification task on a 2D dataset. + - This test does not consider filtering by the `is_data_valuation_issue` column before setting the threshold. + """ + np.random.seed(SEED) + + N, M = 300, 5 + X, y_true = make_blobs(n_samples=N, centers=4, n_features=M, random_state=SEED) + # Split data into training and testing sets + X_train, X_test, y_train_true, y_test = train_test_split( + X, y_true, test_size=0.3, random_state=SEED + ) + + # Introduce random label noise + noise_level = 0.3 + y_noisy, noisy_indices = self.add_label_noise(y_train_true, noise_level) + + # Run dataset through Datalab + lab = Datalab(data={"X": X_train, "y": y_noisy}, label_name="y") + lab.find_issues(features=X_train, issue_types={"data_valuation": {}}) + + data_valuation_issues = lab.get_issues("data_valuation") + scores = data_valuation_issues["data_valuation_score"].to_numpy() + + # Calculate the threshold for the bottom X% of scores + threshold = np.percentile(scores, remove_percentage * 100) + + # Identify the indices to remove based on data valuation scores + indices_to_remove_valuation = np.where(scores <= threshold)[0] + + # Create training set by removing the identified points + X_train_valuation_removed = np.delete(X_train, indices_to_remove_valuation, axis=0) + y_train_noisy_valuation_removed = np.delete(y_noisy, indices_to_remove_valuation, axis=0) + + # Train a model on the reduced dataset + clf_valuation = LogisticRegression(random_state=SEED) + clf_valuation.fit(X_train_valuation_removed, y_train_noisy_valuation_removed) + y_pred_valuation = clf_valuation.predict(X_test) + accuracy_valuation = accuracy_score(y_test, y_pred_valuation) + + # Randomly remove the same amount of data points + random_indices = np.random.choice( + len(X_train), len(indices_to_remove_valuation), replace=False + ) + X_train_random_removed = np.delete(X_train, random_indices, axis=0) + y_train_noisy_random_removed = np.delete(y_noisy, random_indices, axis=0) + + # Train a model on the randomly reduced dataset + clf_random = LogisticRegression(random_state=SEED) + clf_random.fit(X_train_random_removed, y_train_noisy_random_removed) + y_pred_random = clf_random.predict(X_test) + accuracy_random = accuracy_score(y_test, y_pred_random) + + # Assert that removing low valuation points leads to better performance + assert ( + accuracy_valuation > accuracy_random + ), f"Expected accuracy with valuation removal ({accuracy_valuation}) to be higher than random removal ({accuracy_random})" + if accuracy_valuation < 1.0: + assert (1 - accuracy_random) / ( + 1 - accuracy_valuation + ) >= 1.6, "Expected at least a 60% improvement in error rate after removing low valuation points" class TestIssueManagersReuseKnnGraph: diff --git a/tests/test_data_valuation.py b/tests/test_data_valuation.py index 390b740459..197ef923d7 100644 --- a/tests/test_data_valuation.py +++ b/tests/test_data_valuation.py @@ -16,10 +16,14 @@ import numpy as np import pytest +from hypothesis import given, settings, strategies as st 
+from hypothesis.strategies import composite +from hypothesis.extra.numpy import arrays from sklearn.neighbors import NearestNeighbors -from cleanlab.data_valuation import data_shapley_knn +from cleanlab.data_valuation import _knn_shapley_score, data_shapley_knn +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index class TestDataValuation: @@ -52,3 +56,56 @@ def test_data_shapley_knn_with_knn_graph(self, labels, knn_graph): assert shapley.shape == (100,) assert np.all(shapley >= 0) assert np.all(shapley <= 1) + + +@composite +def valid_data(draw): + """ + A custom strategy to generate valid labels, features, and k such that: + - labels and features have the same length + - k is less than the length of labels and features + """ + # Generate a valid length for labels and features + length = draw(st.integers(min_value=11, max_value=1000)) + + # Generate labels and features of the same length + labels = draw( + arrays( + dtype=np.int32, + shape=length, + elements=st.integers(min_value=0, max_value=length - 1), + ) + ) + features = draw( + arrays( + dtype=np.float64, + shape=(length, draw(st.integers(min_value=2, max_value=50))), + elements=st.floats(min_value=-1.0, max_value=1.0), + ) + ) + + # Generate k such that it is less than the length of labels and features + k = draw(st.integers(min_value=1, max_value=length - 1)) + + return labels, features, k + + +class TestDataShapleyKNNScore: + """This test class prioritizes testing the raw/untransformed outputs of the _knn_shapley_score function.""" + + @settings( + max_examples=1000, deadline=None + ) # Increase the number of examples to test more cases + @given(valid_data()) + def test_knn_shapley_score_property(self, data): + labels, features, k = data + + knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k) + neighbor_indices = knn_graph.indices.reshape(-1, k) + + scores = _knn_shapley_score(neighbor_indices, labels, k) + + # Shapley scores should be between -1 and 1 + assert scores.shape == (len(labels),) + assert np.all(scores >= -1) + assert np.all(scores <= 1)
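
The refactor above separates the raw KNN-Shapley computation (`_knn_shapley_score`, whose values fall roughly in [-1, 1]) from the affine rescaling to [0, 1] that `data_shapley_knn` returns. A minimal sketch of how the two relate, mirroring the new property-based test and assuming only the signatures shown in this patch (dataset sizes and random seed are illustrative):

    import numpy as np
    from sklearn.datasets import make_blobs

    from cleanlab.data_valuation import _knn_shapley_score, data_shapley_knn
    from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index

    # Illustrative synthetic dataset; sizes and random_state are arbitrary choices.
    features, labels = make_blobs(n_samples=50, centers=2, n_features=5, random_state=0)
    k = 10

    # Build the KNN graph the same way data_shapley_knn does when no graph is supplied.
    knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k)
    neighbor_indices = knn_graph.indices.reshape(len(labels), -1)

    # Raw per-example KNN-Shapley values, roughly in [-1, 1]; values below 0 correspond
    # to examples estimated to negatively impact a model trained on this data.
    raw_scores = _knn_shapley_score(neighbor_indices, labels, k)

    # The public function rescales these via 0.5 * (raw + 1), so scores land in [0, 1]
    # with 0.5 as the neutral point referenced throughout the updated docstring.
    transformed = data_shapley_knn(labels, features=features, k=k)
    assert np.allclose(transformed, 0.5 * (raw_scores + 1))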
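
The new Datalab tests exercise the same workflow a user would follow: flag low-valuation training points and drop them before refitting a model. A condensed, self-contained sketch of that loop, using only the Datalab calls and columns that appear in this patch (the dataset, the 20% percentile cutoff, and the logistic-regression model are illustrative assumptions):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    from cleanlab import Datalab

    X, y = make_blobs(n_samples=200, centers=2, n_features=5, random_state=0)

    # Run the data_valuation check on the training data.
    lab = Datalab(data={"X": X, "y": y}, label_name="y")
    lab.find_issues(features=X, issue_types={"data_valuation": {}})
    issues = lab.get_issues("data_valuation")

    scores = issues["data_valuation_score"].to_numpy()
    # Scores below 0.5 estimate a negative contribution to model performance;
    # Datalab's own cutoff is stored in lab.info["data_valuation"]["threshold"].
    keep = scores > np.percentile(scores, 20)  # drop the lowest-valued 20% (arbitrary choice)

    # Refit on the retained points only.
    clf = LogisticRegression(random_state=0).fit(X[keep], y[keep])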