diff --git a/.github/workflows/tests_workflow.yml b/.github/workflows/tests_workflow.yml
new file mode 100644
index 0000000..d0862b3
--- /dev/null
+++ b/.github/workflows/tests_workflow.yml
@@ -0,0 +1,27 @@
+name: tests_workflow
+
+# execute this workflow automatically when we push to any branch
+on: [push]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          pip install pytest
+      - name: Install PARC
+        run: |
+          pip install .
+      - name: Run Python tests
+        run: |
+          pytest tests/
+        continue-on-error: false
+
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 8ff9a92..4fc9a7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 env/
 */__pycache__
 *.egg-info
+.eggs/
+.DS_Store
diff --git a/README.md b/README.md
index bd0a16b..6ba0d19 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@
 plt.scatter(X[:, 0], X[:, 1], c=parc_labels, cmap='rainbow')
 plt.show()
 # Run umap on the HNSW knngraph already built in PARC (more time and memory efficient for large datasets)
-// Parc1.knn_struct = p1.make_knn_struct() // if you choose to visualize before running PARC clustering. then you need to include this line
+# Parc1.hnsw_index = p1.create_hnsw_index()  # if you choose to visualize before running PARC clustering, then you need to include this line
 graph = Parc1.knngraph_full()
 X_umap = Parc1.run_umap_hnsw(X, graph)
 plt.scatter(X_umap[:, 0], X_umap[:, 1], c=Parc1.labels)
diff --git a/docs/Examples.rst b/docs/Examples.rst
index a599521..d6bfe8e 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -31,7 +31,7 @@ Examples
     plt.show()
 
     # Run umap on the HNSW knngraph already built in PARC (more time and memory efficient for large datasets)
-    # If you choose to visualize before running PARC clustering. then you need to include this line: Parc1.knn_struct = p1.make_knn_struct()
+    # If you choose to visualize before running PARC clustering,
then you need to include this line: Parc1.hnsw_index = p1.create_hnsw_index() graph = Parc1.knngraph_full() X_umap = Parc1.run_umap_hnsw(X, graph) plt.scatter(X_umap[:, 0], X_umap[:, 1], c=Parc1.labels) diff --git a/parc/_parc.py b/parc/_parc.py index b38f468..d8c8457 100644 --- a/parc/_parc.py +++ b/parc/_parc.py @@ -1,524 +1,965 @@ import numpy as np import pandas as pd import hnswlib +import os +import pathlib +import json from scipy.sparse import csr_matrix import igraph as ig import leidenalg import time from umap.umap_ import find_ab_params, simplicial_set_embedding +from parc.utils import get_mode +from parc.logger import get_logger + + +logger = get_logger(__name__) -#latest github upload 27-June-2020 class PARC: - def __init__(self, data, true_label=None, dist_std_local=3, jac_std_global='median', keep_all_local_dist='auto', - too_big_factor=0.4, small_pop=10, jac_weighted_edges=True, knn=30, n_iter_leiden=5, random_seed=42, - num_threads=-1, distance='l2', time_smallpop=15, partition_type = "ModularityVP", resolution_parameter = 1.0, - knn_struct=None, neighbor_graph=None, hnsw_param_ef_construction = 150): - # higher dist_std_local means more edges are kept - # highter jac_std_global means more edges are kept - if keep_all_local_dist == 'auto': - if data.shape[0] > 300000: - keep_all_local_dist = True # skips local pruning to increase speed + """``PARC``: ``P``henotyping by ``A``ccelerated ``R``efined ``C``ommunity-partitioning. + + Attributes: + knn: + The number of nearest neighbors k for the k-nearest neighbours algorithm. + Larger k means more neighbors in a cluster and therefore less clusters. + n_iter_leiden: + The number of iterations for the Leiden algorithm. + random_seed: + The random seed to enable reproducible Leiden clustering. + distance_metric: + The distance metric to be used in the KNN algorithm: + + * ``l2``: Euclidean distance L^2 norm: + + .. code-block:: python + + d = np.sum((x_i - y_i)**2) + * ``cosine``: cosine similarity: + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) / np.sqrt(sum(x_i*x_i) * np.sum(y_i*y_i)) + * ``ip``: inner product distance: + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) + n_threads: + The number of threads used in the KNN algorithm. + hnsw_param_ef_construction: + A higher value increases accuracy of index construction. + Even for O(100 000) cells, 150-200 is adequate. + neighbor_graph: + A sparse matrix with dimensions ``(n_samples, n_samples)``, containing the + distances between nodes. + hnsw_index: + The HNSW index of the KNN graph on which we perform queries. + l2_std_factor: + The multiplier used in calculating the Euclidean distance threshold for the distance + between two nodes during local pruning: + + .. code-block:: python + + max_distance = np.mean(distances) + l2_std_factor * np.std(distances) + + Avoid setting both the ``jac_std_factor`` (global) and the ``l2_std_factor`` (local) + to < 0.5 as this is very aggressive pruning. + Higher ``l2_std_factor`` means more edges are kept. + do_prune_local: + Whether or not to do local pruning. If ``None`` (default), set to ``False`` if the + number of samples is > 300 000, and set to ``True`` otherwise. + jac_threshold_type: + One of ``"median"`` or ``"mean"``. Determines how the Jaccard similarity threshold is + calculated during global pruning. + jac_std_factor: + The multiplier used in calculating the Jaccard similarity threshold for the similarity + between two nodes during global pruning for ``jac_threshold_type = "mean"``: + + .. 
code-block:: python + + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + Setting ``jac_std_factor = 0.15`` and ``jac_threshold_type="mean"`` performs empirically + similar to ``jac_threshold_type="median"``, which does not use the ``jac_std_factor``. + Generally values between 0-1.5 are reasonable. Higher ``jac_std_factor`` means more + edges are kept. + jac_weighted_edges: + Whether to partition using the weighted graph. + resolution_parameter: + The resolution parameter to be used in the Leiden algorithm. + In order to change ``resolution_parameter``, we switch to ``RBVP``. + partition_type: + The partition type to be used in the Leiden algorithm: + + * ``ModularityVP``: ModularityVertexPartition, ``resolution_parameter=1`` + * ``RBVP``: RBConfigurationVP, Reichardt and Bornholdtā€™s Potts model. Note that this + is the same as ``ModularityVP`` when setting š¯›¾ = 1 and normalising by 2m. + large_community_factor: + A factor used to determine if a community is too large. + If the community size is greater than ``large_community_factor * n_samples``, + then the community is too large and the ``PARC`` algorithm will be run on the single + community to split it up. The default value of ``0.4`` ensures that all communities + will be less than the cutoff size. + small_community_size: + The smallest population size to be considered a community. + small_community_timeout: + The maximum number of seconds trying to check an outlying small community. + """ + def __init__( + self, + x_data: np.ndarray | pd.DataFrame, + y_data_true: np.ndarray | pd.Series | list[int] | None = None, + file_path: pathlib.Path | str | None = None, + knn: int = 30, + n_iter_leiden: int = 5, + random_seed: int = 42, + distance_metric: str = "l2", + n_threads: int = -1, + hnsw_param_ef_construction: int = 150, + neighbor_graph: csr_matrix | None = None, + hnsw_index: hnswlib.Index | None = None, + l2_std_factor: float = 3, + jac_threshold_type: str = "median", + jac_std_factor: float = 0.15, + jac_weighted_edges: bool = True, + do_prune_local: bool | None = None, + large_community_factor: float = 0.4, + small_community_size: int = 10, + small_community_timeout: float = 15, + resolution_parameter: float = 1.0, + partition_type: str = "ModularityVP" + ): + self.x_data = x_data + self.y_data_true = y_data_true + self.y_data_pred = None + + if file_path is not None: + self.load(file_path) + else: + self.knn = knn + self.n_iter_leiden = n_iter_leiden + self.random_seed = random_seed + self.distance_metric = distance_metric + self.n_threads = n_threads + self.hnsw_param_ef_construction = hnsw_param_ef_construction + self.neighbor_graph = neighbor_graph + self.hnsw_index = hnsw_index + self.l2_std_factor = l2_std_factor + self.jac_threshold_type = jac_threshold_type + self.jac_std_factor = jac_std_factor + self.jac_weighted_edges = jac_weighted_edges + self.do_prune_local = do_prune_local + self.large_community_factor = large_community_factor + self.small_community_size = small_community_size + self.small_community_timeout = small_community_timeout + self.resolution_parameter = resolution_parameter + self.partition_type = partition_type + + @property + def x_data(self) -> np.ndarray: + """An array of the input x data, with dimensions ``(n_samples, n_features)``.""" + return self._x_data + + @x_data.setter + def x_data(self, x_data: np.ndarray | pd.DataFrame): + if isinstance(x_data, pd.DataFrame): + x_data = x_data.to_numpy() + self._x_data = x_data + + @property + def y_data_true(self) -> 
np.ndarray: + """An array of the true output y labels, with dimensions ``(n_samples, 1)``.""" + return self._y_data_true + + @y_data_true.setter + def y_data_true(self, y_data_true: np.ndarray | pd.Series | list[int] | None): + if y_data_true is None: + y_data_true = np.array([1] * self.x_data.shape[0]) + elif isinstance(y_data_true, pd.Series): + y_data_true = y_data_true.to_numpy() + elif isinstance(y_data_true, list): + y_data_true = np.array(y_data_true) + self._y_data_true = y_data_true + + @property + def y_data_pred(self) -> np.ndarray | None: + """An array of the predicted output y labels, with dimensions ``(n_samples, 1)``.""" + return self._y_data_pred + + @y_data_pred.setter + def y_data_pred(self, y_data_pred: np.ndarray | pd.Series | list[int] | None): + if isinstance(y_data_pred, pd.Series): + y_data_pred = y_data_pred.to_numpy() + elif isinstance(y_data_pred, list): + y_data_pred = np.array(y_data_pred) + self._y_data_pred = y_data_pred + + @property + def do_prune_local(self) -> bool: + return self._do_prune_local + + @do_prune_local.setter + def do_prune_local(self, do_prune_local: bool | None): + if do_prune_local is None: + if self.x_data.shape[0] > 300000: + logger.message( + f"Sample size is {self.x_data.shape[0]}, setting do_prune_local " + f"to False so that local pruning will be skipped and algorithm will be faster." + ) + do_prune_local = False else: - keep_all_local_dist = False - if resolution_parameter !=1: - partition_type = "RBVP" # Reichardt and Bornholdtā€™s Potts model. Note that this is the same as ModularityVertexPartition when setting š¯›¾ = 1 and normalising by 2m - self.data = data - self.true_label = true_label - self.dist_std_local = dist_std_local # similar to the jac_std_global parameter. avoid setting local and global pruning to both be below 0.5 as this is very aggresive pruning. - self.jac_std_global = jac_std_global #0.15 is also a recommended value performing empirically similar to 'median'. Generally values between 0-1.5 are reasonable. - self.keep_all_local_dist = keep_all_local_dist #decides whether or not to do local pruning. default is 'auto' which omits LOCAL pruning for samples >300,000 cells. - self.too_big_factor = too_big_factor #if a cluster exceeds this share of the entire cell population, then the PARC will be run on the large cluster. at 0.4 it does not come into play - self.small_pop = small_pop # smallest cluster population to be considered a community - self.jac_weighted_edges = jac_weighted_edges #boolean. whether to partition using weighted graph - self.knn = knn - self.n_iter_leiden = n_iter_leiden #the default is 5 in PARC - self.random_seed = random_seed # enable reproducible Leiden clustering - self.num_threads = num_threads # number of threads used in KNN search/construction - self.distance = distance # Euclidean distance 'l2' by default; other options 'ip' and 'cosine' - self.time_smallpop = time_smallpop #number of seconds trying to check an outlier - self.partition_type = partition_type #default is the simple ModularityVertexPartition where resolution_parameter =1. In order to change resolution_parameter, we switch to RBConfigurationVP - self.resolution_parameter = resolution_parameter # defaults to 1. expose this parameter in leidenalg - self.knn_struct = knn_struct #the hnsw index of the KNN graph on which we perform queries - self.neighbor_graph = neighbor_graph # CSR affinity matrix for pre-computed nearest neighbors - self.hnsw_param_ef_construction = hnsw_param_ef_construction #set at 150. 
higher value increases accuracy of index construction. Even for several 100,000s of cells 150-200 is adequate - - def make_knn_struct(self, too_big=False, big_cluster=None): - if self.knn > 190: print('consider using a lower K_in for KNN graph construction') - ef_query = max(100, self.knn + 1) # ef always should be >K. higher ef, more accurate query - if too_big == False: - num_dims = self.data.shape[1] - n_elements = self.data.shape[0] - p = hnswlib.Index(space=self.distance, dim=num_dims) # default to Euclidean distance - p.set_num_threads(self.num_threads) # allow user to set threads used in KNN construction - if n_elements < 10000: - ef_query = min(n_elements - 10, 500) - ef_construction = ef_query + do_prune_local = True + + self._do_prune_local = do_prune_local + + @property + def partition_type(self) -> str: + return self._partition_type + + @partition_type.setter + def partition_type(self, partition_type: str): + if self.resolution_parameter != 1: + self._partition_type = "RBVP" + else: + self._partition_type = partition_type + + @property + def community_counts(self) -> pd.DataFrame: + """A dataframe containing the community counts. + + Returns: + A dataframe with the following columns: + + * ``community_id``: The community ID. + * ``count``: The number of samples in the community. + """ + return self._community_counts + + @community_counts.setter + def community_counts(self, community_counts: pd.DataFrame): + self._community_counts = community_counts + + def create_hnsw_index( + self, + x_data: np.ndarray, + knn: int, + ef_query: int | None = None, + hnsw_param_m: int | None = None, + hnsw_param_ef_construction: int | None = None, + distance_metric: str = "l2", + n_threads: int | None = None + ) -> hnswlib.Index: + """Create a KNN graph using the Hierarchical Navigable Small Worlds (HNSW) algorithm. + + See `hnswlib.Index + `__. + + Args: + x_data: + An array of the input x data, with dimensions ``(n_samples, n_features)``. + knn: + The number of nearest neighbors k for the k-nearest neighbors algorithm. + ef_query: + The ``ef_query`` parameter corresponds to the ``hnswlib.Index`` parameter ``ef``. + It determines the size of the dynamic list for the nearest neighbors + (used during the search). Higher ``ef`` leads to more accurate but slower search. + Must be a value in the interval ``(k, n_samples]``. + hnsw_param_m: + The ``hnsw_param_m`` parameter corresponds to the ``hnswlib.Index`` parameter ``M``. + It corresponds to the number of bi-directional links created for every new element + during the ``hnswlib.Index`` construction. Reasonable range for ``M`` is ``2-100``. + Higher ``M`` works better on datasets with high intrinsic dimensionality and/or + high recall, while lower ``M`` works better for datasets with low intrinsic + dimensionality and/or low recall. The parameter also determines the algorithm's + memory consumption, which is roughly ``M * 8-10 bytes`` per stored element. + + For example, for ``n_features=4`` random vectors, the optimal ``M`` for search + is somewhere around ``6``, while for high dimensional datasets + (word embeddings, good face descriptors, scRNA seq), higher values of ``M`` + are required (e.g. ``M=48-64``) for optimal performance at high recall. + The range ``M=12-48`` is adequate for the most of the use cases. + When ``M`` is changed, one has to update the other parameters. + Nonetheless, ``ef`` and ``ef_construction`` parameters can be roughly estimated + by assuming that ``M*ef_construction`` is a constant. 
+ hnsw_param_ef_construction: + The ``hnsw_param_ef_construction`` parameter corresponds to the ``hnswlib.Index`` + parameter ``ef_construction``. It has the same meaning as ``ef_query``, + but controls the index_time/index_accuracy. Higher values lead to longer + construction, but better index quality. Even for ``O(100 000)`` cells, + ``ef_construction ~ 150-200`` is adequate. + + At some point, increasing ``ef_construction`` does not improve the quality of + the index. One way to check if the selection of ``ef_construction`` is + appropriate is to measure a recall for ``M`` nearest neighbor search when + ``ef = ef_construction``: if the recall is lower than ``0.9``, then there is room + for improvement. + distance_metric: + The distance metric to be used in the KNN algorithm: + + * ``l2``: Euclidean distance L^2 norm: + + .. code-block:: python + + d = np.sum((x_i - y_i)**2) + + * ``cosine``: cosine similarity + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) / np.sqrt(np.sum(x_i*x_i) * np.sum(y_i*y_i)) + + n_threads: + The number of threads used in the KNN algorithm. + + Returns: + The HNSW index of the k-nearest neighbors graph. + """ + + n_features = x_data.shape[1] + n_samples = x_data.shape[0] + + hnsw_index = hnswlib.Index(space=distance_metric, dim=n_features) + + if knn > 190: + logger.message( + f"knn is {knn}, consider using a lower K_in for KNN graph construction" + ) + + if ef_query is None: + if n_samples < 10000: + ef_query = min(n_samples - 10, 500) else: - ef_construction = self.hnsw_param_ef_construction - if (num_dims > 30) & (n_elements<=50000) : - p.init_index(max_elements=n_elements, ef_construction=ef_construction, - M=48) ## good for scRNA seq where dimensionality is high + ef_query = 100 + + ef_query = min(max(ef_query, knn + 1), n_samples) + + if n_threads is not None: + hnsw_index.set_num_threads(n_threads) + + if hnsw_param_m is None: + if n_features > 30 and n_samples <= 50000: + hnsw_param_m = 48 else: - p.init_index(max_elements=n_elements, ef_construction=ef_construction, M=24 ) #30 - p.add_items(self.data) - if too_big == True: - num_dims = big_cluster.shape[1] - n_elements = big_cluster.shape[0] - p = hnswlib.Index(space='l2', dim=num_dims) - p.init_index(max_elements=n_elements, ef_construction=200, M=30) - p.add_items(big_cluster) - p.set_ef(ef_query) # ef should always be > k - - return p - - def knngraph_full(self):#, neighbor_array, distance_array): - k_umap = 15 - t0= time.time() + hnsw_param_m = 24 + + if hnsw_param_ef_construction is None: + if n_samples < 10000: + hnsw_param_ef_construction = min(n_samples - 10, 500) + else: + hnsw_param_ef_construction = self.hnsw_param_ef_construction + + hnsw_index.init_index( + max_elements=n_samples, + ef_construction=hnsw_param_ef_construction, + M=hnsw_param_m + ) + hnsw_index.add_items(x_data) + hnsw_index.set_ef(ef_query) + + return hnsw_index + + def create_knn_graph(self, knn: int = 15) -> csr_matrix: + """Create a full k-nearest neighbors graph using the HNSW algorithm. + + Args: + knn: The number of nearest neighbors k for the k-nearest neighbours algorithm. + + Returns: + A compressed sparse row matrix with dimensions ``(n_samples, n_samples)``, + containing the pruned distances. 
+ """ + # neighbors in array are not listed in in any order of proximity - self.knn_struct.set_ef(k_umap+1) - neighbor_array, distance_array = self.knn_struct.knn_query(self.data, k=k_umap) + self.hnsw_index.set_ef(knn + 1) + neighbor_array, distance_array = self.hnsw_index.knn_query(self.x_data, k=knn) row_list = [] n_neighbors = neighbor_array.shape[1] - n_cells = neighbor_array.shape[0] - - row_list.extend(list(np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten())) + n_samples = neighbor_array.shape[0] + row_list.extend( + list(np.transpose(np.ones((n_neighbors, n_samples)) * range(0, n_samples)).flatten()) + ) row_min = np.min(distance_array, axis=1) row_sigma = np.std(distance_array, axis=1) - distance_array = (distance_array - row_min[:,np.newaxis])/row_sigma[:,np.newaxis] + distance_array = (distance_array - row_min[:, np.newaxis]) / row_sigma[:, np.newaxis] + distance_array = np.sqrt(distance_array.flatten()) * -1 col_list = neighbor_array.flatten().tolist() - distance_array = distance_array.flatten() - distance_array = np.sqrt(distance_array) - distance_array = distance_array * -1 weight_list = np.exp(distance_array) + threshold = np.mean(weight_list) + 2 * np.std(weight_list) + weight_list[weight_list >= threshold] = threshold + csr_array = csr_matrix( + (weight_list, (np.array(row_list), np.array(col_list))), + shape=(n_samples, n_samples) + ) + prod_matrix = csr_array.multiply(csr_array.T) + csr_array = csr_array.T + csr_array - prod_matrix + return csr_array - threshold = np.mean(weight_list) + 2* np.std(weight_list) + def prune_local( + self, + neighbor_array: np.ndarray, + distance_array: np.ndarray, + l2_std_factor: float | None = None + ) -> csr_matrix: + """Prune the nearest neighbors array. - weight_list[weight_list >= threshold] = threshold + If ``do_prune_local==True``, remove any neighbors which are further away than + the specified cutoff distance. Also, remove any self-loops. Return in the ``csr_matrix`` + format. - weight_list = weight_list.tolist() + If ``do_prune_local==False``, then don't perform any pruning and return the original + arrays in the ``csr_matrix`` format. + Args: + neighbor_array: An array with dimensions ``(n_samples, k)`` listing the + k nearest neighbors for each data point. - graph = csr_matrix((np.array(weight_list), (np.array(row_list), np.array(col_list))), - shape=(n_cells, n_cells)) + .. note:: + The neighbors in the array are not listed in any order of proximity. - graph_transpose = graph.T - prod_matrix = graph.multiply(graph_transpose) + distance_array: An array with dimensions ``(n_samples, k)`` listing the + distances to each of the k nearest neighbors for each data point. + l2_std_factor: The multiplier used in calculating the Euclidean distance threshold + for the distance between two nodes during local pruning. If ``None`` (default), + then the value is set to the value of ``self.l2_std_factor``. - graph = graph_transpose + graph - prod_matrix - return graph + Returns: + A compressed sparse row matrix with dimensions ``(n_samples, n_samples)``, + containing the pruned distances. 
+ """ - def make_csrmatrix_noselfloop(self, neighbor_array, distance_array): - # neighbor array not listed in in any order of proximity row_list = [] col_list = [] weight_list = [] n_neighbors = neighbor_array.shape[1] - n_cells = neighbor_array.shape[0] - rowi = 0 - discard_count = 0 - if self.keep_all_local_dist == False: # locally prune based on (squared) l2 distance + n_samples = neighbor_array.shape[0] - print('commencing local pruning based on Euclidean distance metric at', - self.dist_std_local, 's.dev above mean') + if l2_std_factor is None: + l2_std_factor = self.l2_std_factor + else: + self.l2_std_factor = l2_std_factor + + if self.do_prune_local: + logger.message( + "Starting local pruning based on Euclidean distance metric at " + f"{self.l2_std_factor} standard deviations above the mean" + ) distance_array = distance_array + 0.1 - for row in neighbor_array: - distlist = distance_array[rowi, :] - to_keep = np.where(distlist < np.mean(distlist) + self.dist_std_local * np.std(distlist))[0] # 0*std - updated_nn_ind = row[np.ix_(to_keep)] - updated_nn_weights = distlist[np.ix_(to_keep)] - discard_count = discard_count + (n_neighbors - len(to_keep)) - - for ik in range(len(updated_nn_ind)): - if rowi != row[ik]: # remove self-loops - row_list.append(rowi) - col_list.append(updated_nn_ind[ik]) - dist = np.sqrt(updated_nn_weights[ik]) - weight_list.append(1/(dist+0.1)) - - rowi = rowi + 1 - - if self.keep_all_local_dist == True: # dont prune based on distance - row_list.extend(list(np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten())) + for community_id, neighbors in zip(range(n_samples), neighbor_array): + distances = distance_array[community_id, :] + max_distance = np.mean(distances) + self.l2_std_factor * np.std(distances) + to_keep = np.where(distances < max_distance)[0] + updated_neighbors = neighbors[np.ix_(to_keep)] + updated_distances = distances[np.ix_(to_keep)] + + # remove self-loops + for index in range(len(updated_neighbors)): + if community_id != neighbors[index]: + row_list.append(community_id) + col_list.append(updated_neighbors[index]) + distance = np.sqrt(updated_distances[index]) + weight_list.append(1 / (distance + 0.1)) + else: + row_list.extend( + list(np.transpose(np.ones((n_neighbors, n_samples)) * range(n_samples)).flatten()) + ) col_list = neighbor_array.flatten().tolist() - weight_list = (1. / (distance_array.flatten() + 0.1)).tolist() - - csr_graph = csr_matrix((np.array(weight_list), (np.array(row_list), np.array(col_list))), - shape=(n_cells, n_cells)) - return csr_graph - - def func_mode(self, ll): # return MODE of list - # If multiple items are maximal, the function returns the first one encountered. - return max(set(ll), key=ll.count) - - def run_toobig_subPARC(self, X_data, jac_std_toobig=0.3, - jac_weighted_edges=True): - n_elements = X_data.shape[0] - hnsw = self.make_knn_struct(too_big=True, big_cluster=X_data) - if n_elements <= 10: print('consider increasing the too_big_factor') - if n_elements > self.knn: - knnbig = self.knn + weight_list = (1.0 / (distance_array.flatten() + 0.1)).tolist() + + csr_array = csr_matrix( + (np.array(weight_list), (np.array(row_list), np.array(col_list))), + shape=(n_samples, n_samples) + ) + return csr_array + + def prune_global( + self, + csr_array: csr_matrix, + jac_threshold_type: str, + jac_std_factor: float, + jac_weighted_edges: bool, + n_samples: int + ) -> ig.Graph: + """Prune the graph globally based on the Jaccard similarity measure. 
+ + The ``csr_array`` contains the locally-pruned pairwise distances. From this, we can + use the Jaccard similarity metric to compute the similarity score for each edge. We then + remove any edges from the graph that do not meet a minimum similarity threshold. + + Args: + csr_array: A sparse matrix with dimensions ``(n_samples, n_samples)``, + containing the locally-pruned pair-wise distances. + jac_threshold_type: One of ``"median"`` or ``"mean"``. Determines how the + Jaccard similarity threshold is calculated during global pruning. + jac_std_factor: The multiplier used in calculating the Jaccard similarity + threshold for the similarity between two nodes during global pruning for + ``jac_threshold_type = "mean"``: + + .. code-block:: python + + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + Setting ``jac_std_factor = 0.15`` and ``jac_threshold_type="mean"`` performs + empirically similar to ``jac_threshold_type="median"``, which does not use the + ``jac_std_factor``. Generally values between 0-1.5 are reasonable. Higher + ``jac_std_factor`` means more edges are kept. + jac_weighted_edges: Whether to weight the pruned graph. This is always ``True`` for + the top-level ``PARC`` run, but can be changed when pruning the large communities. + n_samples: The number of samples in the data. + + Returns: + A ``Graph`` object which has now been locally and globally pruned. + """ + + logger.message("Starting global pruning...") + + input_nodes, output_nodes = csr_array.nonzero() + edges = list(zip(input_nodes.tolist(), output_nodes.tolist())) + edges_copy = edges.copy() + + logger.info(f"Creating graph with {len(edges)} edges and {n_samples} nodes...") + + graph = ig.Graph(edges, edge_attrs={"weight": csr_array.data.tolist()}) + similarities = np.asarray(graph.similarity_jaccard(pairs=edges_copy)) + + if jac_threshold_type == "median": + threshold = np.median(similarities) + else: + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + indices_similar = np.where(similarities > threshold)[0] + + logger.info( + f"Pruning {len(edges) - len(indices_similar)} edges based on Jaccard similarity " + f"threshold of {threshold:.3f} " + f"(mean = {np.mean(similarities):.3f}, std = {np.std(similarities):.3f}) " + ) + logger.message(f"Creating graph with {len(indices_similar)} edges and {n_samples} nodes...") + + if jac_weighted_edges: + graph_pruned = ig.Graph( + n=n_samples, + edges=list(np.asarray(edges_copy)[indices_similar]), + edge_attrs={"weight": list(similarities[indices_similar])} + ) else: - knnbig = int(max(5, 0.2 * n_elements)) - - neighbor_array, distance_array = hnsw.knn_query(X_data, k=knnbig) - # print('shapes of neigh and dist array', neighbor_array.shape, distance_array.shape) - csr_array = self.make_csrmatrix_noselfloop(neighbor_array, distance_array) - sources, targets = csr_array.nonzero() - #mask = np.zeros(len(sources), dtype=bool) - - #mask |= (csr_array.data < (np.mean(csr_array.data) - np.std(csr_array.data) * 5)) # weights are 1/dist so bigger weight means closer nodes - - #csr_array.data[mask] = 0 - #csr_array.eliminate_zeros() - #sources, targets = csr_array.nonzero() - edgelist = list(zip(sources.tolist(), targets.tolist())) - edgelist_copy = edgelist.copy() - G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()}) - sim_list = G.similarity_jaccard(pairs=edgelist_copy) # list of jaccard weights - new_edgelist = [] - sim_list_array = np.asarray(sim_list) - if jac_std_toobig == 'median': - threshold = 
np.median(sim_list) + graph_pruned = ig.Graph( + n=n_samples, + edges=list(np.asarray(edges_copy)[indices_similar]) + ) + + graph_pruned.simplify(combine_edges="sum") # "first" + return graph_pruned + + def get_leiden_partition( + self, + graph: ig.Graph, + jac_weighted_edges: bool = True + ) -> leidenalg.VertexPartition: + """Partition the graph using the Leiden algorithm. + + A partition is a set of communities. + + Args: + graph: A ``Graph`` object which has been locally and globally pruned. + jac_weighted_edges: Whether to partition using the weighted graph. + + Returns: + A partition object. + See `leidenalg.VertexPartition on GitHub + `_. + """ + + if jac_weighted_edges: + weights = "weight" else: - threshold = np.mean(sim_list) - jac_std_toobig * np.std(sim_list) - print('jac threshold %.3f' % threshold) - print('jac std %.3f' % np.std(sim_list)) - print('jac mean %.3f' % np.mean(sim_list)) - strong_locs = np.where(sim_list_array > threshold)[0] - for ii in strong_locs: new_edgelist.append(edgelist_copy[ii]) - sim_list_new = list(sim_list_array[strong_locs]) - - if jac_weighted_edges == True: - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new}) + weights = None + + if self.partition_type == "ModularityVP": + logger.message( + "Leiden algorithm find partition: partition type = ModularityVertexPartition" + ) + partition = leidenalg.find_partition( + graph=graph, + partition_type=leidenalg.ModularityVertexPartition, + weights=weights, + n_iterations=self.n_iter_leiden, + seed=self.random_seed + ) else: - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist)) - G_sim.simplify(combine_edges='sum') - if jac_weighted_edges == True: - if self.partition_type =='ModularityVP': - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed) - print('partition type MVP') - else: - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter=self.resolution_parameter) - print('partition type RBC') + logger.message( + "Leiden algorithm find partition: partition type = RBConfigurationVertexPartition" + ) + partition = leidenalg.find_partition( + graph=graph, + partition_type=leidenalg.RBConfigurationVertexPartition, + weights=weights, + n_iterations=self.n_iter_leiden, + seed=self.random_seed, + resolution_parameter=self.resolution_parameter + ) + return partition + + def run_toobig_subPARC( + self, + x_data, + jac_threshold_type: str = "mean", + jac_std_factor: float = 0.3, + jac_weighted_edges=True + ): + + n_samples = x_data.shape[0] + hnsw_index = self.create_hnsw_index( + x_data=x_data, + knn=self.knn, + hnsw_param_m=30, + hnsw_param_ef_construction=200, + distance_metric="l2", + ef_query=100 + ) + if n_samples <= 10: + logger.message( + f"Large community is small with only {n_samples} nodes. " + f"Consider increasing the large_community_factor = {self.large_community_factor}." 
+ ) + if n_samples > self.knn: + knnbig = self.knn else: - if self.partition_type == 'ModularityVP': - print('partition type MVP') - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed) - else: - print('partition type RBC') - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed, - resolution_parameter=self.resolution_parameter) - # print('Q= %.2f' % partition.quality()) - PARC_labels_leiden = np.asarray(partition.membership) - PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1)) + knnbig = int(max(5, 0.2 * n_samples)) + + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knnbig) + csr_array = self.prune_local(neighbor_array, distance_array) + + graph_pruned = self.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + n_samples=n_samples + ) + + partition = self.get_leiden_partition( + graph=graph_pruned, + jac_weighted_edges=jac_weighted_edges + ) + + node_communities = np.asarray(partition.membership) + node_communities = np.reshape(node_communities, (n_samples, 1)) small_pop_list = [] small_cluster_list = [] - small_pop_exist = False - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - for cluster in set(PARC_labels_leiden): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True - small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0])) + small_community_exists = False + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + logger.message("Stating small community detection...") + for cluster in set(node_communities): + population = len(np.where(node_communities == cluster)[0]) + if population < self.small_community_size: + small_community_exists = True + small_pop_list.append(list(np.where(node_communities == cluster)[0])) small_cluster_list.append(cluster) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list) if len(available_neighbours) > 0: available_neighbours_list = [value for value in group_of_old_neighbors if value in list(available_neighbours)] best_group = max(available_neighbours_list, key=available_neighbours_list.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group - time_smallpop_start = time.time() - print('handling fragments') - while (small_pop_exist) == True & (time.time() - time_smallpop_start < self.time_smallpop): + time_start = time.time() + while small_community_exists and (time.time() - time_start < self.small_community_timeout): small_pop_list = [] - small_pop_exist = False - for cluster in set(list(PARC_labels_leiden.flatten())): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True + small_community_exists = False + for cluster in set(list(node_communities.flatten())): + population = len(np.where(node_communities == cluster)[0]) + if population < self.small_community_size: + 
small_community_exists = True - small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0]) + small_pop_list.append(np.where(node_communities == cluster)[0]) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] - return PARC_labels_leiden + return node_communities - def run_subPARC(self): + def fit_predict(self, x_data: np.ndarray | None = None) -> np.ndarray: + """Fit the PARC algorithm to the input data and predict the output labels. + + Args: + x_data: An array of the input x data, with dimensions ``(n_samples, n_features)``. + If ``None``, then the ``x_data`` attribute will be used. + + Returns: + An array of the predicted output y labels, with dimensions ``(n_samples, 1)``. + """ + time_start = time.time() - X_data = self.data - too_big_factor = self.too_big_factor - small_pop = self.small_pop - jac_std_global = self.jac_std_global + if x_data is None and self.x_data is None: + raise ValueError("x_data must be provided.") + elif x_data is None: + x_data = self.x_data + else: + self.x_data = x_data + + n_samples = x_data.shape[0] + n_features = x_data.shape[1] + logger.message( + f"Input data has shape {n_samples} (samples) x {n_features} (features)" + ) + + large_community_factor = self.large_community_factor + small_community_size = self.small_community_size + jac_threshold_type = self.jac_threshold_type + jac_std_factor = self.jac_std_factor jac_weighted_edges = self.jac_weighted_edges knn = self.knn - n_elements = X_data.shape[0] - if self.neighbor_graph is not None: csr_array = self.neighbor_graph neighbor_array = np.split(csr_array.indices, csr_array.indptr)[1:-1] else: - if self.knn_struct is None: - print('knn struct was not available, so making one') - self.knn_struct = self.make_knn_struct() + if self.hnsw_index is None: + logger.message("Creating HNSW index...") + self.hnsw_index = self.create_hnsw_index( + x_data=x_data, + knn=knn, + distance_metric=self.distance_metric, + n_threads=self.n_threads + ) else: - print('knn struct already exists') - neighbor_array, distance_array = self.knn_struct.knn_query(X_data, k=knn) - csr_array = self.make_csrmatrix_noselfloop(neighbor_array, distance_array) - - sources, targets = csr_array.nonzero() - - edgelist = list(zip(sources, targets)) - - edgelist_copy = edgelist.copy() - - G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()}) - # print('average degree of prejacard graph is %.1f'% (np.mean(G.degree()))) - # print('computing Jaccard metric') - sim_list = G.similarity_jaccard(pairs=edgelist_copy) - - print('commencing global pruning') + logger.message("HNSW index already exists.") + neighbor_array, distance_array = self.hnsw_index.knn_query(x_data, k=knn) + csr_array = self.prune_local(neighbor_array, distance_array) + + graph_pruned = self.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=True, + n_samples=n_samples + ) - 
sim_list_array = np.asarray(sim_list) - edge_list_copy_array = np.asarray(edgelist_copy) + logger.message("Starting Leiden community detection...") + partition = self.get_leiden_partition( + graph=graph_pruned, + jac_weighted_edges=jac_weighted_edges + ) - if jac_std_global == 'median': - threshold = np.median(sim_list) - else: - threshold = np.mean(sim_list) - jac_std_global * np.std(sim_list) - strong_locs = np.where(sim_list_array > threshold)[0] - # print('Share of edges kept after Global Pruning %.2f' % (len(strong_locs) / len(sim_list)), '%') - new_edgelist = list(edge_list_copy_array[strong_locs]) - sim_list_new = list(sim_list_array[strong_locs]) - - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new}) - # print('average degree of graph is %.1f' % (np.mean(G_sim.degree()))) - G_sim.simplify(combine_edges='sum') # "first" - # print('average degree of SIMPLE graph is %.1f' % (np.mean(G_sim.degree()))) - print('commencing community detection') - if jac_weighted_edges == True: - start_leiden = time.time() - if self.partition_type =='ModularityVP': - print('partition type MVP') - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed) - else: - print('partition type RBC') - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter = self.resolution_parameter) - #print(time.time() - start_leiden) - else: - start_leiden = time.time() - if self.partition_type == 'ModularityVP': - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed) - print('partition type MVP') - else: - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter = self.resolution_parameter) - print('partition type RBC') - # print(time.time() - start_leiden) - time_end_PARC = time.time() - # print('Q= %.1f' % (partition.quality())) - PARC_labels_leiden = np.asarray(partition.membership) - PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1)) + node_communities = np.asarray(partition.membership) + node_communities = np.reshape(node_communities, (n_samples, 1)) too_big = False - # print('labels found after Leiden', set(list(PARC_labels_leiden.T)[0])) will have some outlier clusters that need to be added to a cluster if a cluster has members that are KNN - - cluster_i_loc = np.where(PARC_labels_leiden == 0)[ - 0] # the 0th cluster is the largest one. so if cluster 0 is not too big, then the others wont be too big either - pop_i = len(cluster_i_loc) - if pop_i > too_big_factor * n_elements: # 0.4 + # The 0th cluster is the largest one. + # So, if cluster 0 is not too big, then the others won't be too big either + large_community_id = 0 + community_indices = np.where(node_communities == large_community_id)[0] + community_size = len(community_indices) + + if community_size > large_community_factor * n_samples: + logger.message( + f"\nCommunity 0 is too large and has size:\n" + f"{community_size} > large_community_factor * n_samples = " + f"{large_community_factor} * {n_samples} = {large_community_factor * n_samples}\n" + f"Starting large community expansion..." 
+ ) too_big = True - cluster_big_loc = cluster_i_loc - list_pop_too_bigs = [pop_i] - cluster_too_big = 0 - - while too_big == True: - - X_data_big = X_data[cluster_big_loc, :] - PARC_labels_leiden_big = self.run_toobig_subPARC(X_data_big) - # print('set of new big labels ', set(PARC_labels_leiden_big.flatten())) - PARC_labels_leiden_big = PARC_labels_leiden_big + 100000 - # print('set of new big labels +100000 ', set(list(PARC_labels_leiden_big.flatten()))) - pop_list = [] - - for item in set(list(PARC_labels_leiden_big.flatten())): - pop_list.append([item, list(PARC_labels_leiden_big.flatten()).count(item)]) - print('pop of big clusters', pop_list) + large_community_indices = community_indices + list_pop_too_bigs = [community_size] + else: + logger.message( + f"\nCommunity 0 is not too large and has size:\n" + f"{community_size} <= large_community_factor * n_samples = " + f"{large_community_factor} * {n_samples} = {large_community_factor * n_samples}\n" + "Skipping large community expansion." + ) + + while too_big: + logger.message(f"Expanding large community {large_community_id}...") + node_communities_big = self.run_toobig_subPARC( + x_data=x_data[large_community_indices, :] + ) + node_communities_big = node_communities_big + 100000 + jj = 0 - print('shape PARC_labels_leiden', PARC_labels_leiden.shape) - for j in cluster_big_loc: - PARC_labels_leiden[j] = PARC_labels_leiden_big[jj] + for j in large_community_indices: + node_communities[j] = node_communities_big[jj] jj = jj + 1 - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - print('new set of labels ', set(PARC_labels_leiden)) + node_communities = np.unique( + list(node_communities.flatten()), return_inverse=True + )[1] + too_big = False - set_PARC_labels_leiden = set(PARC_labels_leiden) - - PARC_labels_leiden = np.asarray(PARC_labels_leiden) - for cluster_ii in set_PARC_labels_leiden: - cluster_ii_loc = np.where(PARC_labels_leiden == cluster_ii)[0] - pop_ii = len(cluster_ii_loc) - not_yet_expanded = pop_ii not in list_pop_too_bigs - if pop_ii > too_big_factor * n_elements and not_yet_expanded == True: + node_communities = np.asarray(node_communities) + for community_id in set(node_communities): + community_indices = np.where(node_communities == community_id)[0] + community_size = len(community_indices) + not_yet_expanded = community_size not in list_pop_too_bigs + if community_size > large_community_factor * n_samples and not_yet_expanded: too_big = True - print('cluster', cluster_ii, 'is too big and has population', pop_ii) - cluster_big_loc = cluster_ii_loc - cluster_big = cluster_ii - big_pop = pop_ii - if too_big == True: - list_pop_too_bigs.append(big_pop) - print('cluster', cluster_big, 'is too big with population', big_pop, '. It will be expanded') - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) + logger.message( + f"Community {community_id} is too big and has population {community_size}." + ) + large_community_indices = community_indices + large_community_id = community_id + large_community_size = community_size + if too_big: + list_pop_too_bigs.append(large_community_size) + logger.message( + f"Community {large_community_id} is too big and has population " + f"{large_community_size}. It will be expanded." 
+ ) + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + + logger.message("Starting small community detection...") small_pop_list = [] small_cluster_list = [] - small_pop_exist = False - - for cluster in set(PARC_labels_leiden): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - - if population < small_pop: # 10 - small_pop_exist = True - - small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0])) + small_community_exists = False + + for cluster in set(node_communities): + population = len(np.where(node_communities == cluster)[0]) + if population < small_community_size: + logger.info( + f"Community {cluster} is a small community with population {population}" + ) + small_community_exists = True + small_pop_list.append(list(np.where(node_communities == cluster)[0])) small_cluster_list.append(cluster) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list) if len(available_neighbours) > 0: available_neighbours_list = [value for value in group_of_old_neighbors if value in list(available_neighbours)] best_group = max(available_neighbours_list, key=available_neighbours_list.count) - PARC_labels_leiden[single_cell] = best_group - time_smallpop_start = time.time() - while (small_pop_exist == True) & ((time.time() - time_smallpop_start) < self.time_smallpop): + node_communities[single_cell] = best_group + time_start_sc = time.time() + while small_community_exists and (time.time() - time_start_sc) < self.small_community_timeout: small_pop_list = [] - small_pop_exist = False - for cluster in set(list(PARC_labels_leiden.flatten())): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True - print(cluster, ' has small population of', population, ) - small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0]) + small_community_exists = False + for cluster in set(list(node_communities.flatten())): + population = len(np.where(node_communities == cluster)[0]) + if population < small_community_size: + logger.info( + f"Community {cluster} is a small community with population {population}" + ) + small_community_exists = True + small_pop_list.append(np.where(node_communities == cluster)[0]) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group + + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + node_communities = list(node_communities.flatten()) - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - PARC_labels_leiden = list(PARC_labels_leiden.flatten()) - # print('final labels allocation', set(PARC_labels_leiden)) - pop_list = [] - for item in set(PARC_labels_leiden): - pop_list.append((item, PARC_labels_leiden.count(item))) - print('list of cluster labels and populations', 
len(pop_list), pop_list) + community_counts = pd.DataFrame({ + "community_id": list(set(node_communities)), + "count": [ + node_communities.count(community_id) for community_id in set(node_communities) + ] + }) + logger.message(f"Community labels and sizes:\n{community_counts}") - self.labels = PARC_labels_leiden # list - return + self.y_data_pred = node_communities + run_time = time.time() - time_start + logger.message(f"Time elapsed to run PARC: {run_time:.1f} seconds") + self.compute_performance_metrics(run_time) + return self.y_data_pred - def accuracy(self, onevsall=1): + def accuracy(self, target=1): - true_labels = self.true_label + y_data_true = self.y_data_true Index_dict = {} - PARC_labels = self.labels - N = len(PARC_labels) - n_cancer = list(true_labels).count(onevsall) - n_pbmc = N - n_cancer + y_data_pred = self.y_data_pred + n_samples = len(y_data_pred) + n_target = list(y_data_true).count(target) + n_pbmc = n_samples - n_target - for k in range(N): - Index_dict.setdefault(PARC_labels[k], []).append(true_labels[k]) + for k in range(n_samples): + Index_dict.setdefault(y_data_pred[k], []).append(y_data_true[k]) num_groups = len(Index_dict) sorted_keys = list(sorted(Index_dict.keys())) error_count = [] - pbmc_labels = [] - thp1_labels = [] + negative_labels = [] + positive_labels = [] fp, fn, tp, tn, precision, recall, f1_score = 0, 0, 0, 0, 0, 0, 0 for kk in sorted_keys: vals = [t for t in Index_dict[kk]] - majority_val = self.func_mode(vals) - if majority_val == onevsall: print('cluster', kk, ' has majority', onevsall, 'with population', len(vals)) + majority_val = get_mode(vals) + if majority_val == target: + logger.info(f"Cluster {kk} has majority {target} with population {len(vals)}") if kk == -1: len_unknown = len(vals) - print('len unknown', len_unknown) - if (majority_val == onevsall) and (kk != -1): - thp1_labels.append(kk) - fp = fp + len([e for e in vals if e != onevsall]) - tp = tp + len([e for e in vals if e == onevsall]) + logger.info(f"len unknown: {len_unknown}") + if (majority_val == target) and (kk != -1): + positive_labels.append(kk) + fp = fp + len([e for e in vals if e != target]) + tp = tp + len([e for e in vals if e == target]) list_error = [e for e in vals if e != majority_val] e_count = len(list_error) error_count.append(e_count) - elif (majority_val != onevsall) and (kk != -1): - pbmc_labels.append(kk) - tn = tn + len([e for e in vals if e != onevsall]) - fn = fn + len([e for e in vals if e == onevsall]) + elif (majority_val != target) and (kk != -1): + negative_labels.append(kk) + tn = tn + len([e for e in vals if e != target]) + fn = fn + len([e for e in vals if e == target]) error_count.append(len([e for e in vals if e != majority_val])) - predict_class_array = np.array(PARC_labels) - PARC_labels_array = np.array(PARC_labels) - number_clusters_for_target = len(thp1_labels) - for cancer_class in thp1_labels: - predict_class_array[PARC_labels_array == cancer_class] = 1 - for benign_class in pbmc_labels: - predict_class_array[PARC_labels_array == benign_class] = 0 + predict_class_array = np.array(y_data_pred) + y_data_pred_array = np.array(y_data_pred) + number_clusters_for_target = len(positive_labels) + for cancer_class in positive_labels: + predict_class_array[y_data_pred_array == cancer_class] = 1 + for benign_class in negative_labels: + predict_class_array[y_data_pred_array == benign_class] = 0 predict_class_array.reshape((predict_class_array.shape[0], -1)) - error_rate = sum(error_count) / N + error_rate = sum(error_count) / n_samples n_target = 
tp + fn tnr = tn / n_pbmc - fnr = fn / n_cancer - tpr = tp / n_cancer + fnr = fn / n_target + tpr = tp / n_target fpr = fp / n_pbmc - if tp != 0 or fn != 0: recall = tp / (tp + fn) # ability to find all positives - if tp != 0 or fp != 0: precision = tp / (tp + fp) # ability to not misclassify negatives as positives + if tp != 0 or fn != 0: + recall = tp / (tp + fn) # ability to find all positives + if tp != 0 or fp != 0: + precision = tp / (tp + fp) # ability to not misclassify negatives as positives if precision != 0 or recall != 0: f1_score = precision * recall * 2 / (precision + recall) - majority_truth_labels = np.empty((len(true_labels), 1), dtype=object) + majority_truth_labels = np.empty((len(y_data_true), 1), dtype=object) - for cluster_i in set(PARC_labels): - cluster_i_loc = np.where(np.asarray(PARC_labels) == cluster_i)[0] - true_labels = np.asarray(true_labels) - majority_truth = self.func_mode(list(true_labels[cluster_i_loc])) - majority_truth_labels[cluster_i_loc] = majority_truth + for community_id in set(y_data_pred): + community_indices = np.where(np.asarray(y_data_pred) == community_id)[0] + y_data_true = np.asarray(y_data_true) + majority_truth = get_mode(list(y_data_true[community_indices])) + majority_truth_labels[community_indices] = majority_truth majority_truth_labels = list(majority_truth_labels.flatten()) accuracy_val = [error_rate, f1_score, tnr, fnr, tpr, fpr, precision, @@ -526,65 +967,82 @@ def accuracy(self, onevsall=1): return accuracy_val, predict_class_array, majority_truth_labels, number_clusters_for_target - def run_PARC(self): - print('input data has shape', self.data.shape[0], '(samples) x', self.data.shape[1], '(features)') - if self.true_label is None: - self.true_label = [1] * self.data.shape[0] - list_roc = [] - - time_start_total = time.time() - - time_start_knn = time.time() - - time_end_knn_struct = time.time() - time_start_knn - # Query dataset, k - number of closest elements (returns 2 numpy arrays) - self.run_subPARC() - run_time = time.time() - time_start_total - print('time elapsed {:.1f} seconds'.format(run_time)) + def compute_performance_metrics(self, run_time: float): + """Compute performance metrics for the PARC algorithm. - targets = list(set(self.true_label)) - N = len(list(self.true_label)) + Args: + run_time: (float) the time taken to run the PARC algorithm. 
+ """ + list_roc = [] + targets = list(set(self.y_data_true)) + n_samples = len(list(self.y_data_true)) self.f1_accumulated = 0 self.f1_mean = 0 - self.stats_df = pd.DataFrame({'jac_std_global': [self.jac_std_global], 'dist_std_local': [self.dist_std_local], - 'runtime(s)': [run_time]}) + self.stats_df = pd.DataFrame({ + "jac_std_factor": [self.jac_std_factor], + "l2_std_factor": [self.l2_std_factor], + "runtime(s)": [run_time] + }) self.majority_truth_labels = [] if len(targets) > 1: f1_accumulated = 0 f1_acc_noweighting = 0 - for onevsall_val in targets: - print('target is', onevsall_val) - vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = self.accuracy( - onevsall=onevsall_val) + for target in targets: + logger.info(f"Target is {target}") + vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = \ + self.accuracy(target=target) f1_current = vals_roc[1] - print('target', onevsall_val, 'has f1-score of %.2f' % (f1_current * 100)) - f1_accumulated = f1_accumulated + f1_current * (list(self.true_label).count(onevsall_val)) / N + logger.info(f"Target {target} has f1-score of {(f1_current * 100):.2f}") + f1_accumulated = f1_accumulated + \ + f1_current * (list(self.y_data_true).count(target)) / n_samples f1_acc_noweighting = f1_acc_noweighting + f1_current list_roc.append( - [self.jac_std_global, self.dist_std_local, onevsall_val] + vals_roc + [numclusters_targetval] + [ - run_time]) + [self.jac_std_factor, self.l2_std_factor, target] + + vals_roc + + [numclusters_targetval] + + [run_time] + ) f1_mean = f1_acc_noweighting / len(targets) - print("f1-score (unweighted) mean %.2f" % (f1_mean * 100), '%') - print('f1-score weighted (by population) %.2f' % (f1_accumulated * 100), '%') - df_accuracy = pd.DataFrame(list_roc, - columns=['jac_std_global', 'dist_std_local', 'onevsall-target', 'error rate', - 'f1-score', 'tnr', 'fnr', - 'tpr', 'fpr', 'precision', 'recall', 'num_groups', - 'population of target', 'num clusters', 'clustering runtime']) + df_accuracy = pd.DataFrame( + list_roc, + columns=[ + "jac_std_factor", "l2_std_factor", "target", "error rate", + "f1-score", "tnr", "fnr", "tpr", "fpr", "precision", "recall", "num_groups", + "target population", "num clusters", "clustering runtime" + ] + ) + + logger.message(f"f1-score (unweighted) mean: {(f1_mean * 100):.2f}") + logger.message(f"f1-score weighted (by population): {(f1_accumulated * 100):.2f}") + logger.message( + f"\n{df_accuracy[['target', 'f1-score', 'target population', 'num clusters']]}" + ) self.f1_accumulated = f1_accumulated self.f1_mean = f1_mean self.stats_df = df_accuracy self.majority_truth_labels = majority_truth_labels - return - def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, - negative_sample_rate: int = 5, gamma: float = 1.0, spread=1.0, min_dist=0.1, - n_epochs=0, init_pos="spectral", random_state_seed=1, densmap=False, - densmap_kwds={}, output_dens=False): + def run_umap_hnsw( + self, + x_data, + graph, + n_components=2, + alpha: float = 1.0, + negative_sample_rate: int = 5, + gamma: float = 1.0, + spread=1.0, + min_dist=0.1, + n_epochs=0, + init_pos="spectral", + random_state_seed=1, + densmap=False, + densmap_kwds={}, + output_dens=False + ): """Perform a fuzzy simplicial set embedding, using a specified initialisation method and then minimizing the fuzzy set cross entropy between the 1-skeletons of the high and low-dimensional fuzzy simplicial sets. 
@@ -593,7 +1051,7 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, `__. Args: - X_input: (array) an array containing the input data, with shape n_samples x n_features. + x_data: (array) an array containing the input data, with shape n_samples x n_features. graph: (array) the 1-skeleton of the high dimensional fuzzy simplicial set as represented by a graph for which we require a sparse matrix for the (weighted) adjacency matrix. @@ -633,10 +1091,10 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, """ a, b = find_ab_params(spread, min_dist) - print(f"a: {a}, b: {b}, spread: {spread}, dist: {min_dist}") + logger.message(f"a: {a}, b: {b}, spread: {spread}, dist: {min_dist}") X_umap = simplicial_set_embedding( - data=X_input, + data=x_data, graph=graph, n_components=n_components, initial_alpha=alpha, @@ -655,3 +1113,76 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, output_dens=output_dens ) return X_umap[0] + + def load(self, file_path: pathlib.Path | str): + """Load a PARC object from a file. + + Args: + file_path: The full path name of the JSON file to load the ``PARC`` object from. + """ + logger.message(f"Loading PARC object from: {file_path}") + + file_path = pathlib.Path(file_path).resolve() + if not os.path.isfile(file_path): + raise ValueError(f"{file_path} is not a valid file.") + + if file_path.suffix != ".json": + raise ValueError(f"file_path must have a .json extension, got {file_path.suffix}.") + + with open(file_path, "r") as file: + model_dict = json.load(file) + + self.knn = model_dict["knn"] + self.n_iter_leiden = model_dict["n_iter_leiden"] + self.random_seed = model_dict["random_seed"] + self.distance_metric = model_dict["distance_metric"] + self.n_threads = model_dict["n_threads"] + self.hnsw_param_ef_construction = model_dict["hnsw_param_ef_construction"] + self.l2_std_factor = model_dict["l2_std_factor"] + self.jac_threshold_type = model_dict["jac_threshold_type"] + self.jac_std_factor = model_dict["jac_std_factor"] + self.jac_weighted_edges = model_dict["jac_weighted_edges"] + self.do_prune_local = model_dict["do_prune_local"] + self.large_community_factor = model_dict["large_community_factor"] + self.small_community_size = model_dict["small_community_size"] + self.small_community_timeout = model_dict["small_community_timeout"] + self.resolution_parameter = model_dict["resolution_parameter"] + self.partition_type = model_dict["partition_type"] + + def save(self, file_path: pathlib.Path | str): + """Save the PARC object to a file. + + Args: + file_path: The full path name of the JSON file to save the ``PARC`` object to. 
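+
+        Example:
+            An illustrative sketch; the file name is hypothetical:
+
+            .. code-block:: python
+
+                parc_model = PARC(x_data, y_data_true=y_data)
+                parc_model.fit_predict()
+                parc_model.save("./parc_model.json")
+
+                # the saved hyperparameters can later be restored with ``load``
+                new_model = PARC(x_data, y_data_true=y_data)
+                new_model.load("./parc_model.json")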
+ """ + + file_path = pathlib.Path(file_path).resolve() + if not os.path.isdir(file_path.parent): + raise ValueError(f"{file_path.parent} is not a valid directory.") + + if file_path.suffix != ".json": + raise ValueError(f"file_path must have a .json extension, got {file_path.suffix}.") + + model_dict = { + "knn": self.knn, + "n_iter_leiden": self.n_iter_leiden, + "random_seed": self.random_seed, + "distance_metric": self.distance_metric, + "n_threads": self.n_threads, + "hnsw_param_ef_construction": self.hnsw_param_ef_construction, + "l2_std_factor": self.l2_std_factor, + "jac_threshold_type": self.jac_threshold_type, + "jac_std_factor": self.jac_std_factor, + "jac_weighted_edges": self.jac_weighted_edges, + "do_prune_local": self.do_prune_local, + "large_community_factor": self.large_community_factor, + "small_community_size": self.small_community_size, + "small_community_timeout": self.small_community_timeout, + "resolution_parameter": self.resolution_parameter, + "partition_type": self.partition_type + } + + with open(file_path, "w") as file: + json.dump(model_dict, file) + + logger.message(f"PARC object saved to: {file_path}") diff --git a/parc/logger.py b/parc/logger.py new file mode 100644 index 0000000..0654545 --- /dev/null +++ b/parc/logger.py @@ -0,0 +1,95 @@ +"""Custom colored logger. + +See: + +`StackOverflow `__ +`python-colors GitHub `__ + +""" + +import logging +import sys + +MIN_LEVEL = logging.DEBUG +MESSAGE = 25 +logging.addLevelName(MESSAGE, "MESSAGE") +LOGGING_LEVEL = 25 + + +class LogFilter(logging.Filter): + """Filters (lets through) all messages with level < LEVEL""" + def __init__(self, level): + self.level = level + + def filter(self, record): + # "<" instead of "<=": since logger.setLevel is inclusive, this should + # be exclusive + return record.levelno < self.level + + +class Logger(logging.Logger): + def message(self, msg, *args, **kwargs): + if self.isEnabledFor(MESSAGE): + self._log(MESSAGE, msg, args, **kwargs) + + +class ColoredFormatter(logging.Formatter): + def __init__(self, fmt_prefix, fmt_msg): + self.use_color = self.supports_color() + self.fmt_prefix = fmt_prefix + self.fmt_msg = fmt_msg + super().__init__(fmt=f"{fmt_prefix} {fmt_msg}") + + def format(self, record: logging.LogRecord) -> str: + if record.levelno == logging.WARNING: + if self.use_color: + fmt = f"\x1b[93m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + elif record.levelno == logging.ERROR: + if self.use_color: + fmt = f"\x1b[91m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + elif record.levelno == 25 or record.levelno == logging.INFO: + if self.use_color: + fmt = f"\x1b[96m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + + def supports_color(self) -> bool: + """Check if the system supports ANSI color formatting.""" + return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + + +def get_logger(module_name, level: int = LOGGING_LEVEL) -> Logger: + logging.setLoggerClass(Logger) + stdout_handler = logging.StreamHandler(sys.stdout) + stderr_handler = logging.StreamHandler(sys.stderr) + stdout_handler.addFilter(LogFilter(logging.WARNING)) + stdout_handler.setLevel(level) + if level == 25: + formatter = ColoredFormatter( + fmt_prefix="[%(levelname)s]:", + fmt_msg="%(message)s" 
+ ) + else: + formatter = ColoredFormatter( + fmt_prefix="[%(levelname)s] %(name)s.%(funcName)s (line %(lineno)d):", + fmt_msg="%(message)s" + ) + stdout_handler.setFormatter(formatter) + stderr_handler.setLevel(max(MIN_LEVEL, logging.WARNING)) + # messages lower than WARNING go to stdout + # messages >= WARNING (and >= STDOUT_LOG_LEVEL) go to stderr + logger = logging.getLogger(module_name) + logger.propagate = False + logger.handlers = [stdout_handler, stderr_handler] + logger.setLevel(level) + return logger diff --git a/parc/utils.py b/parc/utils.py new file mode 100644 index 0000000..65e5d35 --- /dev/null +++ b/parc/utils.py @@ -0,0 +1,13 @@ +def get_mode(a_list: list[any]) -> any: + """Get the value which appears most often in the list (the mode). + + If multiple items are maximal, the function returns the first one encountered + (not necessarily the first one in the list). + + Args: + a_list: A list with values. + + Returns: + The most frequent value in the list + """ + return max(set(a_list), key=a_list.count) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0426d55 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +log_cli = true +log_cli_level = 25 +addopts = -rP \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..17ad205 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +pybind11 +numpy +scipy +pandas +hnswlib +igraph +leidenalg>=0.7.0 +umap-learn \ No newline at end of file diff --git a/setup.py b/setup.py index a38f07f..4831f3b 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,25 @@ import setuptools - from os import path -this_directory = path.abspath(path.dirname(__file__)) -with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + +PARC_DIR = path.abspath(path.dirname(__file__)) + +with open(path.join(PARC_DIR, "README.md"), encoding="utf-8") as f: long_description = f.read() setuptools.setup( - name='parc', - version='0.40', #May 28,2024 - packages=['parc',], - license='MIT', - author_email='shobana.venkat88@gmail.com', - url='https://github.com/ShobiStassen/PARC', - setup_requires=['numpy', 'pybind11'], + name="parc", + version="0.40", + packages=["parc"], + license="MIT", + author_email="shobana.venkat88@gmail.com", + url="https://github.com/ShobiStassen/PARC", + setup_requires=["numpy", "pybind11"], install_requires=[ - 'pybind11', 'numpy', 'scipy', 'pandas', 'hnswlib', 'igraph', - 'leidenalg>=0.7.0', 'umap-learn' + line.strip() for line in open("requirements.txt") ], extras_require={ "dev": ["pytest", "scikit-learn"] }, long_description=long_description, - long_description_content_type='text/markdown' + long_description_content_type="text/markdown" ) diff --git a/tests/test_parc.py b/tests/test_parc.py index 5d6d523..c282fed 100644 --- a/tests/test_parc.py +++ b/tests/test_parc.py @@ -1,5 +1,38 @@ +import pytest from sklearn import datasets +import igraph as ig +import numpy as np +import pathlib +import json +import scipy +import igraph +import hnswlib from parc._parc import PARC +from parc.logger import get_logger +from tests.variables import NEIGHBOR_ARRAY_L2, NEIGHBOR_ARRAY_COSINE +from tests.utils import __tmp_dir__, create_tmp_dir, remove_tmp_dir + +logger = get_logger(__name__, 25) + + +def setup_function(): + create_tmp_dir() + + +@pytest.fixture +def iris_data(): + iris = datasets.load_iris() + x_data = iris.data + y_data = iris.target + return x_data, y_data + + +@pytest.fixture +def forest_data(): + forests = datasets.fetch_covtype() + x_data = 
forests.data[list(range(0, 30000)), :] + y_data = forests.target[list(range(0, 30000))] + return x_data, y_data def test_parc_run_umap_hnsw(): @@ -7,9 +40,385 @@ def test_parc_run_umap_hnsw(): x_data = iris.data y_data = iris.target - parc_model = PARC(x_data, true_label=y_data) - parc_model.run_PARC() + parc_model = PARC(x_data, y_data_true=y_data) + parc_model.fit_predict() - graph = parc_model.knngraph_full() + graph = parc_model.create_knn_graph() x_umap = parc_model.run_umap_hnsw(x_data, graph) assert x_umap.shape == (150, 2) + + +@pytest.mark.parametrize("knn", [5, 10, 20, 50]) +@pytest.mark.parametrize("jac_weighted_edges", [True, False]) +def test_parc_get_leiden_partition(iris_data, knn, jac_weighted_edges): + x_data = iris_data[0] + y_data = iris_data[1] + + parc_model = PARC(x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array) + + input_nodes, output_nodes = csr_array.nonzero() + + edges = list(zip(input_nodes, output_nodes)) + + graph = ig.Graph(edges, edge_attrs={"weight": csr_array.data.tolist()}) + + leiden_partition = parc_model.get_leiden_partition( + graph=graph, jac_weighted_edges=jac_weighted_edges + ) + + assert len(leiden_partition.membership) == len(y_data) + assert len(leiden_partition) <= len(y_data) + assert len(leiden_partition) >= 1 + + +@pytest.mark.parametrize( + "knn_hnsw, knn_query, distance_metric, expected_neighbor_array", + [ + (30, 2, "l2", NEIGHBOR_ARRAY_L2), + (30, 2, "cosine", NEIGHBOR_ARRAY_COSINE) + ] +) +def test_parc_create_hnsw_index( + iris_data, knn_hnsw, knn_query, distance_metric, expected_neighbor_array +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index( + x_data=x_data, + knn=knn_hnsw, + distance_metric=distance_metric + ) + assert isinstance(hnsw_index, hnswlib.Index) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn_query) + assert neighbor_array.shape == (x_data.shape[0], knn_query) + assert distance_array.shape == (x_data.shape[0], knn_query) + n_diff = np.count_nonzero(expected_neighbor_array != neighbor_array) + assert n_diff / x_data.shape[0] < 0.1 + + +@pytest.mark.parametrize("knn", [5, 10, 20, 50]) +def test_parc_create_knn_graph(iris_data, knn): + x_data = iris_data[0] + y_data = iris_data[1] + + parc_model = PARC(x_data, y_data_true=y_data) + parc_model.hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + csr_array = parc_model.create_knn_graph(knn=knn) + nn_collection = np.split(csr_array.indices, csr_array.indptr)[1:-1] + assert len(nn_collection) == y_data.shape[0] + + +@pytest.mark.parametrize( + "knn, l2_std_factor, n_edges", + [ + (100, 3.0, 14625), + (100, 100.0, 14850), + (100, -100.0, 0) + ] +) +def test_parc_prune_local( + iris_data, knn, l2_std_factor, n_edges +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array, l2_std_factor) + input_nodes, output_nodes = csr_array.nonzero() + edges = list(zip(input_nodes, output_nodes)) + assert isinstance(csr_array, scipy.sparse.csr_matrix) + assert len(edges) == n_edges + + +@pytest.mark.parametrize( + "knn, 
jac_threshold_type, jac_std_factor, jac_weighted_edges, n_edges", + [ + (100, "median", 0.3, True, 3679), + (100, "mean", 0.3, False, 3865), + (100, "mean", -1000, False, 0), + (100, "mean", 1000, False, 8558) + ] +) +def test_parc_prune_global( + iris_data, knn, jac_threshold_type, jac_std_factor, jac_weighted_edges, n_edges +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array) + graph_pruned = parc_model.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + n_samples=x_data.shape[0] + ) + assert isinstance(graph_pruned, igraph.Graph) + assert graph_pruned.ecount() == n_edges + assert graph_pruned.vcount() == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type," + "f1_mean, f1_accumulated" + ), + [ + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.15, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ) + ] +) +@pytest.mark.parametrize( + "targets_exist", + [ + (True), + (False) + ] +) +def test_parc_fit_predict_fast( + request, dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type, f1_mean, f1_accumulated, targets_exist +): + x_data, y_data = request.getfixturevalue(dataset_name) + if not targets_exist: + y_data = None + + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + + parc_model.fit_predict() + if targets_exist: + assert parc_model.f1_mean >= f1_mean + assert parc_model.f1_accumulated >= f1_accumulated + else: + assert parc_model.f1_mean == 0 + assert parc_model.f1_accumulated == 0 + assert len(parc_model.y_data_pred) == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type," + "f1_mean, f1_accumulated" + ), + [ + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 30, 5, "l2", 150, 3.0, 
"median", 0.15, True, True, 0.15, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 2, 10, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "forest_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.019, 10, 15, 1.0, + "ModularityVP", 0.6, 0.7 + ), + ( + "forest_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.6, 0.7 + ), + ( + "forest_data", 100, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.5, 0.6 + ) + ] +) +@pytest.mark.parametrize( + "targets_exist", + [ + (True), + (False) + ] +) +def test_parc_fit_predict_full( + request, dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type, f1_mean, f1_accumulated, targets_exist +): + x_data, y_data = request.getfixturevalue(dataset_name) + if not targets_exist: + y_data = None + + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + + parc_model.fit_predict() + if targets_exist: + assert parc_model.f1_mean >= f1_mean + assert parc_model.f1_accumulated >= f1_accumulated + else: + assert parc_model.f1_mean == 0 + assert parc_model.f1_accumulated == 0 + assert len(parc_model.y_data_pred) == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type" + ), + [ + ( + pathlib.Path(__tmp_dir__, "parc_model.json"), 30, 5, "l2", 150, 3.0, + "median", 0.15, True, True, 0.4, 10, 15, 1.0, "ModularityVP" + ) + ] +) +def test_parc_save( + iris_data, file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type +): + + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + parc_model.fit_predict() + parc_model.save(file_path) + assert file_path.exists() + with open(file_path) as file: 
+ model_dict = json.load(file) + + for key, value in model_dict.items(): + assert getattr(parc_model, key) == value + + +@pytest.mark.parametrize( + ( + "file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type" + ), + [ + ( + pathlib.Path(__tmp_dir__, "parc_model.json"), 30, 5, "l2", 150, 3.0, + "median", 0.15, True, True, 0.4, 10, 15, 1.0, "ModularityVP" + ) + ] +) +def test_parc_load( + iris_data, file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type +): + + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + parc_model.fit_predict() + parc_model.save(file_path) + + parc_model_saved = PARC( + x_data=x_data, + y_data_true=y_data, + file_path=file_path + ) + + with open(file_path) as file: + model_dict = json.load(file) + + for key, value in model_dict.items(): + assert getattr(parc_model_saved, key) == value + + + +def teardown_function(): + remove_tmp_dir() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..bc422ea --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,14 @@ +import pytest +from parc.utils import get_mode + + +@pytest.mark.parametrize( + "input_list, expected_mode", + [ + ([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]), + ([1, 2, 3, 4, 2, 6], [2]), + (["a", "b", "c", "d", "e", "a", "b", "c"], ["a", "b", "c"]) + ] +) +def test_get_mode(input_list, expected_mode): + assert get_mode(input_list) in expected_mode diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..affb6c0 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,25 @@ +import shutil +import pathlib + +__path_parc__ = pathlib.Path(__file__).parents[1].absolute() +__test_dir__ = pathlib.Path(__path_parc__, "tests") +__tmp_dir__ = pathlib.Path(__path_parc__, "tmp") + + +def create_tmp_dir(): + """Create a temporary directory for testing. + + 1. Remove the ``tmp`` directory if it exists. + 2. Create a new ``tmp`` directory. + + Any data files created during testing will go into ``tmp`` directory. + This is created/removed for each test. 
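+
+    Example (illustrative; mirrors how ``tests/test_parc.py`` wires these helpers
+    into pytest):
+
+    .. code-block:: python
+
+        def setup_function():
+            create_tmp_dir()
+
+        def teardown_function():
+            remove_tmp_dir()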
+ + """ + remove_tmp_dir() + pathlib.Path(__tmp_dir__).mkdir() + + +def remove_tmp_dir(): + """Recursively remove the ``tmp`` directory if it exists.""" + shutil.rmtree(__tmp_dir__, ignore_errors=True) \ No newline at end of file diff --git a/tests/variables.py b/tests/variables.py new file mode 100644 index 0000000..053e3bd --- /dev/null +++ b/tests/variables.py @@ -0,0 +1,42 @@ +import numpy as np + + +NEIGHBOR_ARRAY_L2 = np.array([ + [0, 17], [1, 12], [2, 47], [3, 47], [4, 0], [5, 18], [6, 47], [7, 39], [8, 38], [9, 34], + [10, 48], [11, 7], [12, 1], [13, 38], [14, 33], [15, 33], [16, 10], [17, 0], [18, 5], [19, 21], + [20, 31], [21, 19], [22, 6], [23, 26], [24, 11], [25, 34], [26, 23], [27, 39], [28, 0], + [29, 30], [30, 34], [31, 20], [32, 46], [33, 32], [34, 9], [35, 49], [36, 10], [37, 4], [38, 8], + [39, 7], [40, 17], [41, 8], [42, 38], [43, 26], [44, 46], [45, 1], [46, 19], [47, 2], [48, 10], + [49, 7], [50, 52], [51, 56], [52, 50], [53, 89], [54, 58], [55, 66], [56, 51], [57, 93], + [58, 75], [59, 89], [60, 93], [61, 96], [62, 92], [63, 91], [64, 82], [65, 75], [66, 84], + [67, 92], [68, 87], [69, 80], [70, 138], [71, 97], [72, 133], [73, 63], [74, 97], [75, 65], + [76, 58], [77, 52], [78, 91], [79, 81], [80, 81], [81, 80], [82, 92], [83, 133], [84, 66], + [85, 56], [86, 52], [87, 68], [88, 96], [89, 53], [90, 94], [91, 63], [92, 82], [93, 57], + [94, 99], [95, 96], [96, 95], [97, 74], [98, 57], [99, 96], [100, 136], [101, 142], [102, 125], + [103, 137], [104, 132], [105, 122], [106, 84], [107, 130], [108, 128], [109, 143], [110, 147], + [111, 147], [112, 139], [113, 101], [114, 121], [115, 148], [116, 137], [117, 131], [118, 122], + [119, 72], [120, 143], [121, 142], [122, 105], [123, 126], [124, 120], [125, 129], [126, 123], + [127, 138], [128, 132], [129, 125], [130, 107], [131, 117], [132, 128], [133, 83], [134, 103], + [135, 130], [136, 148], [137, 116], [138, 127], [139, 112], [140, 144], [141, 145], [101, 142], + [143, 120], [144, 140], [145, 141], [146, 123], [147, 110], [148, 136], [149, 127] +]) + +NEIGHBOR_ARRAY_COSINE = np.array([ + [0, 10], [1, 12], [2, 0], [3, 8], [4, 42], [5, 6], [6, 19], [7, 38], [8, 3], [9, 12], + [10, 0], [11, 29], [12, 9], [13, 0], [14, 13], [15, 33], [16, 40], [17, 2], [18, 39], [19, 6], + [20, 34], [21, 6], [22, 33], [23, 26], [24, 29], [25, 20], [26, 18], [27, 39], [28, 49], + [29, 3], [30, 8], [31, 45], [32, 33], [33, 15], [34, 20], [35, 36], [36, 35], [37, 4], [38, 7], + [39, 27], [40, 17], [41, 25], [42, 4], [43, 26], [44, 24], [45, 31], [46, 42], [47, 7], + [48, 10], [49, 28], [50, 74], [51, 99], [52, 86], [53, 54], [54, 52], [55, 63], [56, 59], + [57, 82], [58, 80], [59, 56], [60, 76], [61, 51], [62, 87], [63, 55], [64, 57], [65, 71], + [66, 84], [67, 69], [68, 87], [69, 58], [70, 84], [71, 65], [72, 129], [73, 90], [74, 50], + [75, 74], [76, 60], [77, 89], [78, 94], [79, 93], [80, 58], [81, 80], [82, 50], [83, 116], + [84, 66], [85, 56], [86, 97], [87, 60], [88, 95], [89, 52], [90, 55], [91, 94], [92, 69], + [93, 65], [94, 91], [95, 88], [96, 99], [97, 86], [98, 64], [99, 96], [100, 106], [101, 142], + [102, 111], [103, 83], [104, 142], [105, 108], [106, 121], [107, 105], [108, 122], [109, 115], + [110, 138], [111, 102], [112, 147], [113, 132], [114, 144], [115, 109], [116, 83], [117, 137], + [118, 122], [119, 130], [120, 140], [121, 109], [122, 108], [123, 126], [124, 149], [125, 83], + [126, 139], [127, 138], [128, 132], [129, 133], [130, 119], [131, 55], [132, 113], [133, 125], + [134, 107], [135, 146], [136, 148], [137, 117], [138, 
127], [139, 126], [140, 120], [141, 145], + [142, 101], [143, 142], [144, 115], [145, 141], [146, 135], [147, 139], [148, 136], [149, 124] +])
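The neighbor-array fixtures above can be regenerated approximately with a short script; the snippet below is only a sketch, assuming the fixtures come from a k=2 HNSW query on the iris data against an index built with knn=30 (the same way tests/test_parc.py consumes them), and small mismatches are expected because HNSW search is approximate.

    # sketch: regenerate an expected neighbor array for tests/variables.py
    from sklearn import datasets
    from parc._parc import PARC

    x_data, y_data = datasets.load_iris(return_X_y=True)
    parc_model = PARC(x_data=x_data, y_data_true=y_data)
    hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=30, distance_metric="l2")
    neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=2)
    print(repr(neighbor_array))  # compare against NEIGHBOR_ARRAY_L2; <10% mismatch tolerated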