diff --git a/.github/workflows/tests_workflow.yml b/.github/workflows/tests_workflow.yml
new file mode 100644
index 0000000..d0862b3
--- /dev/null
+++ b/.github/workflows/tests_workflow.yml
@@ -0,0 +1,27 @@
+name: tests_workflow
+
+# execute this workflow automatically when we push to any branch
+on: [push]
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          pip install pytest
+      - name: Install PARC
+        run: |
+          pip install .
+      - name: Run Python tests
+        run: |
+          pytest tests/
+        continue-on-error: false
+
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 8ff9a92..4fc9a7e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 env/
 */__pycache__
 *.egg-info
+.eggs/
+.DS_Store
diff --git a/README.md b/README.md
index bd0a16b..6ba0d19 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@
 plt.scatter(X[:, 0], X[:, 1], c=parc_labels, cmap='rainbow')
 plt.show()
 # Run umap on the HNSW knngraph already built in PARC (more time and memory efficient for large datasets)
-// Parc1.knn_struct = p1.make_knn_struct() // if you choose to visualize before running PARC clustering. then you need to include this line
+# Parc1.hnsw_index = p1.create_hnsw_index()  # if you choose to visualize before running PARC clustering, then you need to include this line
 graph = Parc1.knngraph_full()
 X_umap = Parc1.run_umap_hnsw(X, graph)
 plt.scatter(X_umap[:, 0], X_umap[:, 1], c=Parc1.labels)
diff --git a/docs/Examples.rst b/docs/Examples.rst
index a599521..d6bfe8e 100644
--- a/docs/Examples.rst
+++ b/docs/Examples.rst
@@ -31,7 +31,7 @@ Examples
     plt.show()
 
     # Run umap on the HNSW knngraph already built in PARC (more time and memory efficient for large datasets)
-    # If you choose to visualize before running PARC clustering. then you need to include this line: Parc1.knn_struct = p1.make_knn_struct()
+    # If you choose to visualize before running PARC clustering,
then you need to include this line: Parc1.hnsw_index = p1.create_hnsw_index() graph = Parc1.knngraph_full() X_umap = Parc1.run_umap_hnsw(X, graph) plt.scatter(X_umap[:, 0], X_umap[:, 1], c=Parc1.labels) diff --git a/parc/_parc.py b/parc/_parc.py index b38f468..d8c8457 100644 --- a/parc/_parc.py +++ b/parc/_parc.py @@ -1,524 +1,965 @@ import numpy as np import pandas as pd import hnswlib +import os +import pathlib +import json from scipy.sparse import csr_matrix import igraph as ig import leidenalg import time from umap.umap_ import find_ab_params, simplicial_set_embedding +from parc.utils import get_mode +from parc.logger import get_logger + + +logger = get_logger(__name__) -#latest github upload 27-June-2020 class PARC: - def __init__(self, data, true_label=None, dist_std_local=3, jac_std_global='median', keep_all_local_dist='auto', - too_big_factor=0.4, small_pop=10, jac_weighted_edges=True, knn=30, n_iter_leiden=5, random_seed=42, - num_threads=-1, distance='l2', time_smallpop=15, partition_type = "ModularityVP", resolution_parameter = 1.0, - knn_struct=None, neighbor_graph=None, hnsw_param_ef_construction = 150): - # higher dist_std_local means more edges are kept - # highter jac_std_global means more edges are kept - if keep_all_local_dist == 'auto': - if data.shape[0] > 300000: - keep_all_local_dist = True # skips local pruning to increase speed + """``PARC``: ``P``henotyping by ``A``ccelerated ``R``efined ``C``ommunity-partitioning. + + Attributes: + knn: + The number of nearest neighbors k for the k-nearest neighbours algorithm. + Larger k means more neighbors in a cluster and therefore less clusters. + n_iter_leiden: + The number of iterations for the Leiden algorithm. + random_seed: + The random seed to enable reproducible Leiden clustering. + distance_metric: + The distance metric to be used in the KNN algorithm: + + * ``l2``: Euclidean distance L^2 norm: + + .. code-block:: python + + d = np.sum((x_i - y_i)**2) + * ``cosine``: cosine similarity: + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) / np.sqrt(sum(x_i*x_i) * np.sum(y_i*y_i)) + * ``ip``: inner product distance: + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) + n_threads: + The number of threads used in the KNN algorithm. + hnsw_param_ef_construction: + A higher value increases accuracy of index construction. + Even for O(100 000) cells, 150-200 is adequate. + neighbor_graph: + A sparse matrix with dimensions ``(n_samples, n_samples)``, containing the + distances between nodes. + hnsw_index: + The HNSW index of the KNN graph on which we perform queries. + l2_std_factor: + The multiplier used in calculating the Euclidean distance threshold for the distance + between two nodes during local pruning: + + .. code-block:: python + + max_distance = np.mean(distances) + l2_std_factor * np.std(distances) + + Avoid setting both the ``jac_std_factor`` (global) and the ``l2_std_factor`` (local) + to < 0.5 as this is very aggressive pruning. + Higher ``l2_std_factor`` means more edges are kept. + do_prune_local: + Whether or not to do local pruning. If ``None`` (default), set to ``False`` if the + number of samples is > 300 000, and set to ``True`` otherwise. + jac_threshold_type: + One of ``"median"`` or ``"mean"``. Determines how the Jaccard similarity threshold is + calculated during global pruning. + jac_std_factor: + The multiplier used in calculating the Jaccard similarity threshold for the similarity + between two nodes during global pruning for ``jac_threshold_type = "mean"``: + + .. 
code-block:: python + + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + Setting ``jac_std_factor = 0.15`` and ``jac_threshold_type="mean"`` performs empirically + similar to ``jac_threshold_type="median"``, which does not use the ``jac_std_factor``. + Generally values between 0-1.5 are reasonable. Higher ``jac_std_factor`` means more + edges are kept. + jac_weighted_edges: + Whether to partition using the weighted graph. + resolution_parameter: + The resolution parameter to be used in the Leiden algorithm. + In order to change ``resolution_parameter``, we switch to ``RBVP``. + partition_type: + The partition type to be used in the Leiden algorithm: + + * ``ModularityVP``: ModularityVertexPartition, ``resolution_parameter=1`` + * ``RBVP``: RBConfigurationVP, Reichardt and Bornholdtā€™s Potts model. Note that this + is the same as ``ModularityVP`` when setting š¯›¾ = 1 and normalising by 2m. + large_community_factor: + A factor used to determine if a community is too large. + If the community size is greater than ``large_community_factor * n_samples``, + then the community is too large and the ``PARC`` algorithm will be run on the single + community to split it up. The default value of ``0.4`` ensures that all communities + will be less than the cutoff size. + small_community_size: + The smallest population size to be considered a community. + small_community_timeout: + The maximum number of seconds trying to check an outlying small community. + """ + def __init__( + self, + x_data: np.ndarray | pd.DataFrame, + y_data_true: np.ndarray | pd.Series | list[int] | None = None, + file_path: pathlib.Path | str | None = None, + knn: int = 30, + n_iter_leiden: int = 5, + random_seed: int = 42, + distance_metric: str = "l2", + n_threads: int = -1, + hnsw_param_ef_construction: int = 150, + neighbor_graph: csr_matrix | None = None, + hnsw_index: hnswlib.Index | None = None, + l2_std_factor: float = 3, + jac_threshold_type: str = "median", + jac_std_factor: float = 0.15, + jac_weighted_edges: bool = True, + do_prune_local: bool | None = None, + large_community_factor: float = 0.4, + small_community_size: int = 10, + small_community_timeout: float = 15, + resolution_parameter: float = 1.0, + partition_type: str = "ModularityVP" + ): + self.x_data = x_data + self.y_data_true = y_data_true + self.y_data_pred = None + + if file_path is not None: + self.load(file_path) + else: + self.knn = knn + self.n_iter_leiden = n_iter_leiden + self.random_seed = random_seed + self.distance_metric = distance_metric + self.n_threads = n_threads + self.hnsw_param_ef_construction = hnsw_param_ef_construction + self.neighbor_graph = neighbor_graph + self.hnsw_index = hnsw_index + self.l2_std_factor = l2_std_factor + self.jac_threshold_type = jac_threshold_type + self.jac_std_factor = jac_std_factor + self.jac_weighted_edges = jac_weighted_edges + self.do_prune_local = do_prune_local + self.large_community_factor = large_community_factor + self.small_community_size = small_community_size + self.small_community_timeout = small_community_timeout + self.resolution_parameter = resolution_parameter + self.partition_type = partition_type + + @property + def x_data(self) -> np.ndarray: + """An array of the input x data, with dimensions ``(n_samples, n_features)``.""" + return self._x_data + + @x_data.setter + def x_data(self, x_data: np.ndarray | pd.DataFrame): + if isinstance(x_data, pd.DataFrame): + x_data = x_data.to_numpy() + self._x_data = x_data + + @property + def y_data_true(self) -> 
np.ndarray: + """An array of the true output y labels, with dimensions ``(n_samples, 1)``.""" + return self._y_data_true + + @y_data_true.setter + def y_data_true(self, y_data_true: np.ndarray | pd.Series | list[int] | None): + if y_data_true is None: + y_data_true = np.array([1] * self.x_data.shape[0]) + elif isinstance(y_data_true, pd.Series): + y_data_true = y_data_true.to_numpy() + elif isinstance(y_data_true, list): + y_data_true = np.array(y_data_true) + self._y_data_true = y_data_true + + @property + def y_data_pred(self) -> np.ndarray | None: + """An array of the predicted output y labels, with dimensions ``(n_samples, 1)``.""" + return self._y_data_pred + + @y_data_pred.setter + def y_data_pred(self, y_data_pred: np.ndarray | pd.Series | list[int] | None): + if isinstance(y_data_pred, pd.Series): + y_data_pred = y_data_pred.to_numpy() + elif isinstance(y_data_pred, list): + y_data_pred = np.array(y_data_pred) + self._y_data_pred = y_data_pred + + @property + def do_prune_local(self) -> bool: + return self._do_prune_local + + @do_prune_local.setter + def do_prune_local(self, do_prune_local: bool | None): + if do_prune_local is None: + if self.x_data.shape[0] > 300000: + logger.message( + f"Sample size is {self.x_data.shape[0]}, setting do_prune_local " + f"to False so that local pruning will be skipped and algorithm will be faster." + ) + do_prune_local = False else: - keep_all_local_dist = False - if resolution_parameter !=1: - partition_type = "RBVP" # Reichardt and Bornholdtā€™s Potts model. Note that this is the same as ModularityVertexPartition when setting š¯›¾ = 1 and normalising by 2m - self.data = data - self.true_label = true_label - self.dist_std_local = dist_std_local # similar to the jac_std_global parameter. avoid setting local and global pruning to both be below 0.5 as this is very aggresive pruning. - self.jac_std_global = jac_std_global #0.15 is also a recommended value performing empirically similar to 'median'. Generally values between 0-1.5 are reasonable. - self.keep_all_local_dist = keep_all_local_dist #decides whether or not to do local pruning. default is 'auto' which omits LOCAL pruning for samples >300,000 cells. - self.too_big_factor = too_big_factor #if a cluster exceeds this share of the entire cell population, then the PARC will be run on the large cluster. at 0.4 it does not come into play - self.small_pop = small_pop # smallest cluster population to be considered a community - self.jac_weighted_edges = jac_weighted_edges #boolean. whether to partition using weighted graph - self.knn = knn - self.n_iter_leiden = n_iter_leiden #the default is 5 in PARC - self.random_seed = random_seed # enable reproducible Leiden clustering - self.num_threads = num_threads # number of threads used in KNN search/construction - self.distance = distance # Euclidean distance 'l2' by default; other options 'ip' and 'cosine' - self.time_smallpop = time_smallpop #number of seconds trying to check an outlier - self.partition_type = partition_type #default is the simple ModularityVertexPartition where resolution_parameter =1. In order to change resolution_parameter, we switch to RBConfigurationVP - self.resolution_parameter = resolution_parameter # defaults to 1. expose this parameter in leidenalg - self.knn_struct = knn_struct #the hnsw index of the KNN graph on which we perform queries - self.neighbor_graph = neighbor_graph # CSR affinity matrix for pre-computed nearest neighbors - self.hnsw_param_ef_construction = hnsw_param_ef_construction #set at 150. 
higher value increases accuracy of index construction. Even for several 100,000s of cells 150-200 is adequate - - def make_knn_struct(self, too_big=False, big_cluster=None): - if self.knn > 190: print('consider using a lower K_in for KNN graph construction') - ef_query = max(100, self.knn + 1) # ef always should be >K. higher ef, more accurate query - if too_big == False: - num_dims = self.data.shape[1] - n_elements = self.data.shape[0] - p = hnswlib.Index(space=self.distance, dim=num_dims) # default to Euclidean distance - p.set_num_threads(self.num_threads) # allow user to set threads used in KNN construction - if n_elements < 10000: - ef_query = min(n_elements - 10, 500) - ef_construction = ef_query + do_prune_local = True + + self._do_prune_local = do_prune_local + + @property + def partition_type(self) -> str: + return self._partition_type + + @partition_type.setter + def partition_type(self, partition_type: str): + if self.resolution_parameter != 1: + self._partition_type = "RBVP" + else: + self._partition_type = partition_type + + @property + def community_counts(self) -> pd.DataFrame: + """A dataframe containing the community counts. + + Returns: + A dataframe with the following columns: + + * ``community_id``: The community ID. + * ``count``: The number of samples in the community. + """ + return self._community_counts + + @community_counts.setter + def community_counts(self, community_counts: pd.DataFrame): + self._community_counts = community_counts + + def create_hnsw_index( + self, + x_data: np.ndarray, + knn: int, + ef_query: int | None = None, + hnsw_param_m: int | None = None, + hnsw_param_ef_construction: int | None = None, + distance_metric: str = "l2", + n_threads: int | None = None + ) -> hnswlib.Index: + """Create a KNN graph using the Hierarchical Navigable Small Worlds (HNSW) algorithm. + + See `hnswlib.Index + `__. + + Args: + x_data: + An array of the input x data, with dimensions ``(n_samples, n_features)``. + knn: + The number of nearest neighbors k for the k-nearest neighbors algorithm. + ef_query: + The ``ef_query`` parameter corresponds to the ``hnswlib.Index`` parameter ``ef``. + It determines the size of the dynamic list for the nearest neighbors + (used during the search). Higher ``ef`` leads to more accurate but slower search. + Must be a value in the interval ``(k, n_samples]``. + hnsw_param_m: + The ``hnsw_param_m`` parameter corresponds to the ``hnswlib.Index`` parameter ``M``. + It corresponds to the number of bi-directional links created for every new element + during the ``hnswlib.Index`` construction. Reasonable range for ``M`` is ``2-100``. + Higher ``M`` works better on datasets with high intrinsic dimensionality and/or + high recall, while lower ``M`` works better for datasets with low intrinsic + dimensionality and/or low recall. The parameter also determines the algorithm's + memory consumption, which is roughly ``M * 8-10 bytes`` per stored element. + + For example, for ``n_features=4`` random vectors, the optimal ``M`` for search + is somewhere around ``6``, while for high dimensional datasets + (word embeddings, good face descriptors, scRNA seq), higher values of ``M`` + are required (e.g. ``M=48-64``) for optimal performance at high recall. + The range ``M=12-48`` is adequate for the most of the use cases. + When ``M`` is changed, one has to update the other parameters. + Nonetheless, ``ef`` and ``ef_construction`` parameters can be roughly estimated + by assuming that ``M*ef_construction`` is a constant. 
+ hnsw_param_ef_construction: + The ``hnsw_param_ef_construction`` parameter corresponds to the ``hnswlib.Index`` + parameter ``ef_construction``. It has the same meaning as ``ef_query``, + but controls the index_time/index_accuracy. Higher values lead to longer + construction, but better index quality. Even for ``O(100 000)`` cells, + ``ef_construction ~ 150-200`` is adequate. + + At some point, increasing ``ef_construction`` does not improve the quality of + the index. One way to check if the selection of ``ef_construction`` is + appropriate is to measure a recall for ``M`` nearest neighbor search when + ``ef = ef_construction``: if the recall is lower than ``0.9``, then there is room + for improvement. + distance_metric: + The distance metric to be used in the KNN algorithm: + + * ``l2``: Euclidean distance L^2 norm: + + .. code-block:: python + + d = np.sum((x_i - y_i)**2) + + * ``cosine``: cosine similarity + + .. code-block:: python + + d = 1.0 - np.sum(x_i*y_i) / np.sqrt(np.sum(x_i*x_i) * np.sum(y_i*y_i)) + + n_threads: + The number of threads used in the KNN algorithm. + + Returns: + The HNSW index of the k-nearest neighbors graph. + """ + + n_features = x_data.shape[1] + n_samples = x_data.shape[0] + + hnsw_index = hnswlib.Index(space=distance_metric, dim=n_features) + + if knn > 190: + logger.message( + f"knn is {knn}, consider using a lower K_in for KNN graph construction" + ) + + if ef_query is None: + if n_samples < 10000: + ef_query = min(n_samples - 10, 500) else: - ef_construction = self.hnsw_param_ef_construction - if (num_dims > 30) & (n_elements<=50000) : - p.init_index(max_elements=n_elements, ef_construction=ef_construction, - M=48) ## good for scRNA seq where dimensionality is high + ef_query = 100 + + ef_query = min(max(ef_query, knn + 1), n_samples) + + if n_threads is not None: + hnsw_index.set_num_threads(n_threads) + + if hnsw_param_m is None: + if n_features > 30 and n_samples <= 50000: + hnsw_param_m = 48 else: - p.init_index(max_elements=n_elements, ef_construction=ef_construction, M=24 ) #30 - p.add_items(self.data) - if too_big == True: - num_dims = big_cluster.shape[1] - n_elements = big_cluster.shape[0] - p = hnswlib.Index(space='l2', dim=num_dims) - p.init_index(max_elements=n_elements, ef_construction=200, M=30) - p.add_items(big_cluster) - p.set_ef(ef_query) # ef should always be > k - - return p - - def knngraph_full(self):#, neighbor_array, distance_array): - k_umap = 15 - t0= time.time() + hnsw_param_m = 24 + + if hnsw_param_ef_construction is None: + if n_samples < 10000: + hnsw_param_ef_construction = min(n_samples - 10, 500) + else: + hnsw_param_ef_construction = self.hnsw_param_ef_construction + + hnsw_index.init_index( + max_elements=n_samples, + ef_construction=hnsw_param_ef_construction, + M=hnsw_param_m + ) + hnsw_index.add_items(x_data) + hnsw_index.set_ef(ef_query) + + return hnsw_index + + def create_knn_graph(self, knn: int = 15) -> csr_matrix: + """Create a full k-nearest neighbors graph using the HNSW algorithm. + + Args: + knn: The number of nearest neighbors k for the k-nearest neighbours algorithm. + + Returns: + A compressed sparse row matrix with dimensions ``(n_samples, n_samples)``, + containing the pruned distances. 
+ """ + # neighbors in array are not listed in in any order of proximity - self.knn_struct.set_ef(k_umap+1) - neighbor_array, distance_array = self.knn_struct.knn_query(self.data, k=k_umap) + self.hnsw_index.set_ef(knn + 1) + neighbor_array, distance_array = self.hnsw_index.knn_query(self.x_data, k=knn) row_list = [] n_neighbors = neighbor_array.shape[1] - n_cells = neighbor_array.shape[0] - - row_list.extend(list(np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten())) + n_samples = neighbor_array.shape[0] + row_list.extend( + list(np.transpose(np.ones((n_neighbors, n_samples)) * range(0, n_samples)).flatten()) + ) row_min = np.min(distance_array, axis=1) row_sigma = np.std(distance_array, axis=1) - distance_array = (distance_array - row_min[:,np.newaxis])/row_sigma[:,np.newaxis] + distance_array = (distance_array - row_min[:, np.newaxis]) / row_sigma[:, np.newaxis] + distance_array = np.sqrt(distance_array.flatten()) * -1 col_list = neighbor_array.flatten().tolist() - distance_array = distance_array.flatten() - distance_array = np.sqrt(distance_array) - distance_array = distance_array * -1 weight_list = np.exp(distance_array) + threshold = np.mean(weight_list) + 2 * np.std(weight_list) + weight_list[weight_list >= threshold] = threshold + csr_array = csr_matrix( + (weight_list, (np.array(row_list), np.array(col_list))), + shape=(n_samples, n_samples) + ) + prod_matrix = csr_array.multiply(csr_array.T) + csr_array = csr_array.T + csr_array - prod_matrix + return csr_array - threshold = np.mean(weight_list) + 2* np.std(weight_list) + def prune_local( + self, + neighbor_array: np.ndarray, + distance_array: np.ndarray, + l2_std_factor: float | None = None + ) -> csr_matrix: + """Prune the nearest neighbors array. - weight_list[weight_list >= threshold] = threshold + If ``do_prune_local==True``, remove any neighbors which are further away than + the specified cutoff distance. Also, remove any self-loops. Return in the ``csr_matrix`` + format. - weight_list = weight_list.tolist() + If ``do_prune_local==False``, then don't perform any pruning and return the original + arrays in the ``csr_matrix`` format. + Args: + neighbor_array: An array with dimensions ``(n_samples, k)`` listing the + k nearest neighbors for each data point. - graph = csr_matrix((np.array(weight_list), (np.array(row_list), np.array(col_list))), - shape=(n_cells, n_cells)) + .. note:: + The neighbors in the array are not listed in any order of proximity. - graph_transpose = graph.T - prod_matrix = graph.multiply(graph_transpose) + distance_array: An array with dimensions ``(n_samples, k)`` listing the + distances to each of the k nearest neighbors for each data point. + l2_std_factor: The multiplier used in calculating the Euclidean distance threshold + for the distance between two nodes during local pruning. If ``None`` (default), + then the value is set to the value of ``self.l2_std_factor``. - graph = graph_transpose + graph - prod_matrix - return graph + Returns: + A compressed sparse row matrix with dimensions ``(n_samples, n_samples)``, + containing the pruned distances. 
+ """ - def make_csrmatrix_noselfloop(self, neighbor_array, distance_array): - # neighbor array not listed in in any order of proximity row_list = [] col_list = [] weight_list = [] n_neighbors = neighbor_array.shape[1] - n_cells = neighbor_array.shape[0] - rowi = 0 - discard_count = 0 - if self.keep_all_local_dist == False: # locally prune based on (squared) l2 distance + n_samples = neighbor_array.shape[0] - print('commencing local pruning based on Euclidean distance metric at', - self.dist_std_local, 's.dev above mean') + if l2_std_factor is None: + l2_std_factor = self.l2_std_factor + else: + self.l2_std_factor = l2_std_factor + + if self.do_prune_local: + logger.message( + "Starting local pruning based on Euclidean distance metric at " + f"{self.l2_std_factor} standard deviations above the mean" + ) distance_array = distance_array + 0.1 - for row in neighbor_array: - distlist = distance_array[rowi, :] - to_keep = np.where(distlist < np.mean(distlist) + self.dist_std_local * np.std(distlist))[0] # 0*std - updated_nn_ind = row[np.ix_(to_keep)] - updated_nn_weights = distlist[np.ix_(to_keep)] - discard_count = discard_count + (n_neighbors - len(to_keep)) - - for ik in range(len(updated_nn_ind)): - if rowi != row[ik]: # remove self-loops - row_list.append(rowi) - col_list.append(updated_nn_ind[ik]) - dist = np.sqrt(updated_nn_weights[ik]) - weight_list.append(1/(dist+0.1)) - - rowi = rowi + 1 - - if self.keep_all_local_dist == True: # dont prune based on distance - row_list.extend(list(np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten())) + for community_id, neighbors in zip(range(n_samples), neighbor_array): + distances = distance_array[community_id, :] + max_distance = np.mean(distances) + self.l2_std_factor * np.std(distances) + to_keep = np.where(distances < max_distance)[0] + updated_neighbors = neighbors[np.ix_(to_keep)] + updated_distances = distances[np.ix_(to_keep)] + + # remove self-loops + for index in range(len(updated_neighbors)): + if community_id != neighbors[index]: + row_list.append(community_id) + col_list.append(updated_neighbors[index]) + distance = np.sqrt(updated_distances[index]) + weight_list.append(1 / (distance + 0.1)) + else: + row_list.extend( + list(np.transpose(np.ones((n_neighbors, n_samples)) * range(n_samples)).flatten()) + ) col_list = neighbor_array.flatten().tolist() - weight_list = (1. / (distance_array.flatten() + 0.1)).tolist() - - csr_graph = csr_matrix((np.array(weight_list), (np.array(row_list), np.array(col_list))), - shape=(n_cells, n_cells)) - return csr_graph - - def func_mode(self, ll): # return MODE of list - # If multiple items are maximal, the function returns the first one encountered. - return max(set(ll), key=ll.count) - - def run_toobig_subPARC(self, X_data, jac_std_toobig=0.3, - jac_weighted_edges=True): - n_elements = X_data.shape[0] - hnsw = self.make_knn_struct(too_big=True, big_cluster=X_data) - if n_elements <= 10: print('consider increasing the too_big_factor') - if n_elements > self.knn: - knnbig = self.knn + weight_list = (1.0 / (distance_array.flatten() + 0.1)).tolist() + + csr_array = csr_matrix( + (np.array(weight_list), (np.array(row_list), np.array(col_list))), + shape=(n_samples, n_samples) + ) + return csr_array + + def prune_global( + self, + csr_array: csr_matrix, + jac_threshold_type: str, + jac_std_factor: float, + jac_weighted_edges: bool, + n_samples: int + ) -> ig.Graph: + """Prune the graph globally based on the Jaccard similarity measure. 
+ + The ``csr_array`` contains the locally-pruned pairwise distances. From this, we can + use the Jaccard similarity metric to compute the similarity score for each edge. We then + remove any edges from the graph that do not meet a minimum similarity threshold. + + Args: + csr_array: A sparse matrix with dimensions ``(n_samples, n_samples)``, + containing the locally-pruned pair-wise distances. + jac_threshold_type: One of ``"median"`` or ``"mean"``. Determines how the + Jaccard similarity threshold is calculated during global pruning. + jac_std_factor: The multiplier used in calculating the Jaccard similarity + threshold for the similarity between two nodes during global pruning for + ``jac_threshold_type = "mean"``: + + .. code-block:: python + + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + Setting ``jac_std_factor = 0.15`` and ``jac_threshold_type="mean"`` performs + empirically similar to ``jac_threshold_type="median"``, which does not use the + ``jac_std_factor``. Generally values between 0-1.5 are reasonable. Higher + ``jac_std_factor`` means more edges are kept. + jac_weighted_edges: Whether to weight the pruned graph. This is always ``True`` for + the top-level ``PARC`` run, but can be changed when pruning the large communities. + n_samples: The number of samples in the data. + + Returns: + A ``Graph`` object which has now been locally and globally pruned. + """ + + logger.message("Starting global pruning...") + + input_nodes, output_nodes = csr_array.nonzero() + edges = list(zip(input_nodes.tolist(), output_nodes.tolist())) + edges_copy = edges.copy() + + logger.info(f"Creating graph with {len(edges)} edges and {n_samples} nodes...") + + graph = ig.Graph(edges, edge_attrs={"weight": csr_array.data.tolist()}) + similarities = np.asarray(graph.similarity_jaccard(pairs=edges_copy)) + + if jac_threshold_type == "median": + threshold = np.median(similarities) + else: + threshold = np.mean(similarities) - jac_std_factor * np.std(similarities) + + indices_similar = np.where(similarities > threshold)[0] + + logger.info( + f"Pruning {len(edges) - len(indices_similar)} edges based on Jaccard similarity " + f"threshold of {threshold:.3f} " + f"(mean = {np.mean(similarities):.3f}, std = {np.std(similarities):.3f}) " + ) + logger.message(f"Creating graph with {len(indices_similar)} edges and {n_samples} nodes...") + + if jac_weighted_edges: + graph_pruned = ig.Graph( + n=n_samples, + edges=list(np.asarray(edges_copy)[indices_similar]), + edge_attrs={"weight": list(similarities[indices_similar])} + ) else: - knnbig = int(max(5, 0.2 * n_elements)) - - neighbor_array, distance_array = hnsw.knn_query(X_data, k=knnbig) - # print('shapes of neigh and dist array', neighbor_array.shape, distance_array.shape) - csr_array = self.make_csrmatrix_noselfloop(neighbor_array, distance_array) - sources, targets = csr_array.nonzero() - #mask = np.zeros(len(sources), dtype=bool) - - #mask |= (csr_array.data < (np.mean(csr_array.data) - np.std(csr_array.data) * 5)) # weights are 1/dist so bigger weight means closer nodes - - #csr_array.data[mask] = 0 - #csr_array.eliminate_zeros() - #sources, targets = csr_array.nonzero() - edgelist = list(zip(sources.tolist(), targets.tolist())) - edgelist_copy = edgelist.copy() - G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()}) - sim_list = G.similarity_jaccard(pairs=edgelist_copy) # list of jaccard weights - new_edgelist = [] - sim_list_array = np.asarray(sim_list) - if jac_std_toobig == 'median': - threshold = 
np.median(sim_list) + graph_pruned = ig.Graph( + n=n_samples, + edges=list(np.asarray(edges_copy)[indices_similar]) + ) + + graph_pruned.simplify(combine_edges="sum") # "first" + return graph_pruned + + def get_leiden_partition( + self, + graph: ig.Graph, + jac_weighted_edges: bool = True + ) -> leidenalg.VertexPartition: + """Partition the graph using the Leiden algorithm. + + A partition is a set of communities. + + Args: + graph: A ``Graph`` object which has been locally and globally pruned. + jac_weighted_edges: Whether to partition using the weighted graph. + + Returns: + A partition object. + See `leidenalg.VertexPartition on GitHub + `_. + """ + + if jac_weighted_edges: + weights = "weight" else: - threshold = np.mean(sim_list) - jac_std_toobig * np.std(sim_list) - print('jac threshold %.3f' % threshold) - print('jac std %.3f' % np.std(sim_list)) - print('jac mean %.3f' % np.mean(sim_list)) - strong_locs = np.where(sim_list_array > threshold)[0] - for ii in strong_locs: new_edgelist.append(edgelist_copy[ii]) - sim_list_new = list(sim_list_array[strong_locs]) - - if jac_weighted_edges == True: - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new}) + weights = None + + if self.partition_type == "ModularityVP": + logger.message( + "Leiden algorithm find partition: partition type = ModularityVertexPartition" + ) + partition = leidenalg.find_partition( + graph=graph, + partition_type=leidenalg.ModularityVertexPartition, + weights=weights, + n_iterations=self.n_iter_leiden, + seed=self.random_seed + ) else: - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist)) - G_sim.simplify(combine_edges='sum') - if jac_weighted_edges == True: - if self.partition_type =='ModularityVP': - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed) - print('partition type MVP') - else: - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter=self.resolution_parameter) - print('partition type RBC') + logger.message( + "Leiden algorithm find partition: partition type = RBConfigurationVertexPartition" + ) + partition = leidenalg.find_partition( + graph=graph, + partition_type=leidenalg.RBConfigurationVertexPartition, + weights=weights, + n_iterations=self.n_iter_leiden, + seed=self.random_seed, + resolution_parameter=self.resolution_parameter + ) + return partition + + def run_toobig_subPARC( + self, + x_data, + jac_threshold_type: str = "mean", + jac_std_factor: float = 0.3, + jac_weighted_edges=True + ): + + n_samples = x_data.shape[0] + hnsw_index = self.create_hnsw_index( + x_data=x_data, + knn=self.knn, + hnsw_param_m=30, + hnsw_param_ef_construction=200, + distance_metric="l2", + ef_query=100 + ) + if n_samples <= 10: + logger.message( + f"Large community is small with only {n_samples} nodes. " + f"Consider increasing the large_community_factor = {self.large_community_factor}." 
+ ) + if n_samples > self.knn: + knnbig = self.knn else: - if self.partition_type == 'ModularityVP': - print('partition type MVP') - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed) - else: - print('partition type RBC') - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed, - resolution_parameter=self.resolution_parameter) - # print('Q= %.2f' % partition.quality()) - PARC_labels_leiden = np.asarray(partition.membership) - PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1)) + knnbig = int(max(5, 0.2 * n_samples)) + + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knnbig) + csr_array = self.prune_local(neighbor_array, distance_array) + + graph_pruned = self.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + n_samples=n_samples + ) + + partition = self.get_leiden_partition( + graph=graph_pruned, + jac_weighted_edges=jac_weighted_edges + ) + + node_communities = np.asarray(partition.membership) + node_communities = np.reshape(node_communities, (n_samples, 1)) small_pop_list = [] small_cluster_list = [] - small_pop_exist = False - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - for cluster in set(PARC_labels_leiden): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True - small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0])) + small_community_exists = False + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + logger.message("Stating small community detection...") + for cluster in set(node_communities): + population = len(np.where(node_communities == cluster)[0]) + if population < self.small_community_size: + small_community_exists = True + small_pop_list.append(list(np.where(node_communities == cluster)[0])) small_cluster_list.append(cluster) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list) if len(available_neighbours) > 0: available_neighbours_list = [value for value in group_of_old_neighbors if value in list(available_neighbours)] best_group = max(available_neighbours_list, key=available_neighbours_list.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group - time_smallpop_start = time.time() - print('handling fragments') - while (small_pop_exist) == True & (time.time() - time_smallpop_start < self.time_smallpop): + time_start = time.time() + while small_community_exists and (time.time() - time_start < self.small_community_timeout): small_pop_list = [] - small_pop_exist = False - for cluster in set(list(PARC_labels_leiden.flatten())): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True + small_community_exists = False + for cluster in set(list(node_communities.flatten())): + population = len(np.where(node_communities == cluster)[0]) + if population < self.small_community_size: + 
small_community_exists = True - small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0]) + small_pop_list.append(np.where(node_communities == cluster)[0]) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] - return PARC_labels_leiden + return node_communities - def run_subPARC(self): + def fit_predict(self, x_data: np.ndarray | None = None) -> np.ndarray: + """Fit the PARC algorithm to the input data and predict the output labels. + + Args: + x_data: An array of the input x data, with dimensions ``(n_samples, n_features)``. + If ``None``, then the ``x_data`` attribute will be used. + + Returns: + An array of the predicted output y labels, with dimensions ``(n_samples, 1)``. + """ + time_start = time.time() - X_data = self.data - too_big_factor = self.too_big_factor - small_pop = self.small_pop - jac_std_global = self.jac_std_global + if x_data is None and self.x_data is None: + raise ValueError("x_data must be provided.") + elif x_data is None: + x_data = self.x_data + else: + self.x_data = x_data + + n_samples = x_data.shape[0] + n_features = x_data.shape[1] + logger.message( + f"Input data has shape {n_samples} (samples) x {n_features} (features)" + ) + + large_community_factor = self.large_community_factor + small_community_size = self.small_community_size + jac_threshold_type = self.jac_threshold_type + jac_std_factor = self.jac_std_factor jac_weighted_edges = self.jac_weighted_edges knn = self.knn - n_elements = X_data.shape[0] - if self.neighbor_graph is not None: csr_array = self.neighbor_graph neighbor_array = np.split(csr_array.indices, csr_array.indptr)[1:-1] else: - if self.knn_struct is None: - print('knn struct was not available, so making one') - self.knn_struct = self.make_knn_struct() + if self.hnsw_index is None: + logger.message("Creating HNSW index...") + self.hnsw_index = self.create_hnsw_index( + x_data=x_data, + knn=knn, + distance_metric=self.distance_metric, + n_threads=self.n_threads + ) else: - print('knn struct already exists') - neighbor_array, distance_array = self.knn_struct.knn_query(X_data, k=knn) - csr_array = self.make_csrmatrix_noselfloop(neighbor_array, distance_array) - - sources, targets = csr_array.nonzero() - - edgelist = list(zip(sources, targets)) - - edgelist_copy = edgelist.copy() - - G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()}) - # print('average degree of prejacard graph is %.1f'% (np.mean(G.degree()))) - # print('computing Jaccard metric') - sim_list = G.similarity_jaccard(pairs=edgelist_copy) - - print('commencing global pruning') + logger.message("HNSW index already exists.") + neighbor_array, distance_array = self.hnsw_index.knn_query(x_data, k=knn) + csr_array = self.prune_local(neighbor_array, distance_array) + + graph_pruned = self.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=True, + n_samples=n_samples + ) - 
sim_list_array = np.asarray(sim_list) - edge_list_copy_array = np.asarray(edgelist_copy) + logger.message("Starting Leiden community detection...") + partition = self.get_leiden_partition( + graph=graph_pruned, + jac_weighted_edges=jac_weighted_edges + ) - if jac_std_global == 'median': - threshold = np.median(sim_list) - else: - threshold = np.mean(sim_list) - jac_std_global * np.std(sim_list) - strong_locs = np.where(sim_list_array > threshold)[0] - # print('Share of edges kept after Global Pruning %.2f' % (len(strong_locs) / len(sim_list)), '%') - new_edgelist = list(edge_list_copy_array[strong_locs]) - sim_list_new = list(sim_list_array[strong_locs]) - - G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new}) - # print('average degree of graph is %.1f' % (np.mean(G_sim.degree()))) - G_sim.simplify(combine_edges='sum') # "first" - # print('average degree of SIMPLE graph is %.1f' % (np.mean(G_sim.degree()))) - print('commencing community detection') - if jac_weighted_edges == True: - start_leiden = time.time() - if self.partition_type =='ModularityVP': - print('partition type MVP') - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed) - else: - print('partition type RBC') - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight', - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter = self.resolution_parameter) - #print(time.time() - start_leiden) - else: - start_leiden = time.time() - if self.partition_type == 'ModularityVP': - partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed) - print('partition type MVP') - else: - partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, - n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter = self.resolution_parameter) - print('partition type RBC') - # print(time.time() - start_leiden) - time_end_PARC = time.time() - # print('Q= %.1f' % (partition.quality())) - PARC_labels_leiden = np.asarray(partition.membership) - PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1)) + node_communities = np.asarray(partition.membership) + node_communities = np.reshape(node_communities, (n_samples, 1)) too_big = False - # print('labels found after Leiden', set(list(PARC_labels_leiden.T)[0])) will have some outlier clusters that need to be added to a cluster if a cluster has members that are KNN - - cluster_i_loc = np.where(PARC_labels_leiden == 0)[ - 0] # the 0th cluster is the largest one. so if cluster 0 is not too big, then the others wont be too big either - pop_i = len(cluster_i_loc) - if pop_i > too_big_factor * n_elements: # 0.4 + # The 0th cluster is the largest one. + # So, if cluster 0 is not too big, then the others won't be too big either + large_community_id = 0 + community_indices = np.where(node_communities == large_community_id)[0] + community_size = len(community_indices) + + if community_size > large_community_factor * n_samples: + logger.message( + f"\nCommunity 0 is too large and has size:\n" + f"{community_size} > large_community_factor * n_samples = " + f"{large_community_factor} * {n_samples} = {large_community_factor * n_samples}\n" + f"Starting large community expansion..." 
+ ) too_big = True - cluster_big_loc = cluster_i_loc - list_pop_too_bigs = [pop_i] - cluster_too_big = 0 - - while too_big == True: - - X_data_big = X_data[cluster_big_loc, :] - PARC_labels_leiden_big = self.run_toobig_subPARC(X_data_big) - # print('set of new big labels ', set(PARC_labels_leiden_big.flatten())) - PARC_labels_leiden_big = PARC_labels_leiden_big + 100000 - # print('set of new big labels +100000 ', set(list(PARC_labels_leiden_big.flatten()))) - pop_list = [] - - for item in set(list(PARC_labels_leiden_big.flatten())): - pop_list.append([item, list(PARC_labels_leiden_big.flatten()).count(item)]) - print('pop of big clusters', pop_list) + large_community_indices = community_indices + list_pop_too_bigs = [community_size] + else: + logger.message( + f"\nCommunity 0 is not too large and has size:\n" + f"{community_size} <= large_community_factor * n_samples = " + f"{large_community_factor} * {n_samples} = {large_community_factor * n_samples}\n" + "Skipping large community expansion." + ) + + while too_big: + logger.message(f"Expanding large community {large_community_id}...") + node_communities_big = self.run_toobig_subPARC( + x_data=x_data[large_community_indices, :] + ) + node_communities_big = node_communities_big + 100000 + jj = 0 - print('shape PARC_labels_leiden', PARC_labels_leiden.shape) - for j in cluster_big_loc: - PARC_labels_leiden[j] = PARC_labels_leiden_big[jj] + for j in large_community_indices: + node_communities[j] = node_communities_big[jj] jj = jj + 1 - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - print('new set of labels ', set(PARC_labels_leiden)) + node_communities = np.unique( + list(node_communities.flatten()), return_inverse=True + )[1] + too_big = False - set_PARC_labels_leiden = set(PARC_labels_leiden) - - PARC_labels_leiden = np.asarray(PARC_labels_leiden) - for cluster_ii in set_PARC_labels_leiden: - cluster_ii_loc = np.where(PARC_labels_leiden == cluster_ii)[0] - pop_ii = len(cluster_ii_loc) - not_yet_expanded = pop_ii not in list_pop_too_bigs - if pop_ii > too_big_factor * n_elements and not_yet_expanded == True: + node_communities = np.asarray(node_communities) + for community_id in set(node_communities): + community_indices = np.where(node_communities == community_id)[0] + community_size = len(community_indices) + not_yet_expanded = community_size not in list_pop_too_bigs + if community_size > large_community_factor * n_samples and not_yet_expanded: too_big = True - print('cluster', cluster_ii, 'is too big and has population', pop_ii) - cluster_big_loc = cluster_ii_loc - cluster_big = cluster_ii - big_pop = pop_ii - if too_big == True: - list_pop_too_bigs.append(big_pop) - print('cluster', cluster_big, 'is too big with population', big_pop, '. It will be expanded') - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) + logger.message( + f"Community {community_id} is too big and has population {community_size}." + ) + large_community_indices = community_indices + large_community_id = community_id + large_community_size = community_size + if too_big: + list_pop_too_bigs.append(large_community_size) + logger.message( + f"Community {large_community_id} is too big and has population " + f"{large_community_size}. It will be expanded." 
+ ) + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + + logger.message("Starting small community detection...") small_pop_list = [] small_cluster_list = [] - small_pop_exist = False - - for cluster in set(PARC_labels_leiden): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - - if population < small_pop: # 10 - small_pop_exist = True - - small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0])) + small_community_exists = False + + for cluster in set(node_communities): + population = len(np.where(node_communities == cluster)[0]) + if population < small_community_size: + logger.info( + f"Community {cluster} is a small community with population {population}" + ) + small_community_exists = True + small_pop_list.append(list(np.where(node_communities == cluster)[0])) small_cluster_list.append(cluster) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list) if len(available_neighbours) > 0: available_neighbours_list = [value for value in group_of_old_neighbors if value in list(available_neighbours)] best_group = max(available_neighbours_list, key=available_neighbours_list.count) - PARC_labels_leiden[single_cell] = best_group - time_smallpop_start = time.time() - while (small_pop_exist == True) & ((time.time() - time_smallpop_start) < self.time_smallpop): + node_communities[single_cell] = best_group + time_start_sc = time.time() + while small_community_exists and (time.time() - time_start_sc) < self.small_community_timeout: small_pop_list = [] - small_pop_exist = False - for cluster in set(list(PARC_labels_leiden.flatten())): - population = len(np.where(PARC_labels_leiden == cluster)[0]) - if population < small_pop: - small_pop_exist = True - print(cluster, ' has small population of', population, ) - small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0]) + small_community_exists = False + for cluster in set(list(node_communities.flatten())): + population = len(np.where(node_communities == cluster)[0]) + if population < small_community_size: + logger.info( + f"Community {cluster} is a small community with population {population}" + ) + small_community_exists = True + small_pop_list.append(np.where(node_communities == cluster)[0]) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell] - group_of_old_neighbors = PARC_labels_leiden[old_neighbors] + group_of_old_neighbors = node_communities[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count) - PARC_labels_leiden[single_cell] = best_group + node_communities[single_cell] = best_group + + node_communities = np.unique(list(node_communities.flatten()), return_inverse=True)[1] + node_communities = list(node_communities.flatten()) - dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) - PARC_labels_leiden = list(PARC_labels_leiden.flatten()) - # print('final labels allocation', set(PARC_labels_leiden)) - pop_list = [] - for item in set(PARC_labels_leiden): - pop_list.append((item, PARC_labels_leiden.count(item))) - print('list of cluster labels and populations', 
len(pop_list), pop_list) + community_counts = pd.DataFrame({ + "community_id": list(set(node_communities)), + "count": [ + node_communities.count(community_id) for community_id in set(node_communities) + ] + }) + logger.message(f"Community labels and sizes:\n{community_counts}") - self.labels = PARC_labels_leiden # list - return + self.y_data_pred = node_communities + run_time = time.time() - time_start + logger.message(f"Time elapsed to run PARC: {run_time:.1f} seconds") + self.compute_performance_metrics(run_time) + return self.y_data_pred - def accuracy(self, onevsall=1): + def accuracy(self, target=1): - true_labels = self.true_label + y_data_true = self.y_data_true Index_dict = {} - PARC_labels = self.labels - N = len(PARC_labels) - n_cancer = list(true_labels).count(onevsall) - n_pbmc = N - n_cancer + y_data_pred = self.y_data_pred + n_samples = len(y_data_pred) + n_target = list(y_data_true).count(target) + n_pbmc = n_samples - n_target - for k in range(N): - Index_dict.setdefault(PARC_labels[k], []).append(true_labels[k]) + for k in range(n_samples): + Index_dict.setdefault(y_data_pred[k], []).append(y_data_true[k]) num_groups = len(Index_dict) sorted_keys = list(sorted(Index_dict.keys())) error_count = [] - pbmc_labels = [] - thp1_labels = [] + negative_labels = [] + positive_labels = [] fp, fn, tp, tn, precision, recall, f1_score = 0, 0, 0, 0, 0, 0, 0 for kk in sorted_keys: vals = [t for t in Index_dict[kk]] - majority_val = self.func_mode(vals) - if majority_val == onevsall: print('cluster', kk, ' has majority', onevsall, 'with population', len(vals)) + majority_val = get_mode(vals) + if majority_val == target: + logger.info(f"Cluster {kk} has majority {target} with population {len(vals)}") if kk == -1: len_unknown = len(vals) - print('len unknown', len_unknown) - if (majority_val == onevsall) and (kk != -1): - thp1_labels.append(kk) - fp = fp + len([e for e in vals if e != onevsall]) - tp = tp + len([e for e in vals if e == onevsall]) + logger.info(f"len unknown: {len_unknown}") + if (majority_val == target) and (kk != -1): + positive_labels.append(kk) + fp = fp + len([e for e in vals if e != target]) + tp = tp + len([e for e in vals if e == target]) list_error = [e for e in vals if e != majority_val] e_count = len(list_error) error_count.append(e_count) - elif (majority_val != onevsall) and (kk != -1): - pbmc_labels.append(kk) - tn = tn + len([e for e in vals if e != onevsall]) - fn = fn + len([e for e in vals if e == onevsall]) + elif (majority_val != target) and (kk != -1): + negative_labels.append(kk) + tn = tn + len([e for e in vals if e != target]) + fn = fn + len([e for e in vals if e == target]) error_count.append(len([e for e in vals if e != majority_val])) - predict_class_array = np.array(PARC_labels) - PARC_labels_array = np.array(PARC_labels) - number_clusters_for_target = len(thp1_labels) - for cancer_class in thp1_labels: - predict_class_array[PARC_labels_array == cancer_class] = 1 - for benign_class in pbmc_labels: - predict_class_array[PARC_labels_array == benign_class] = 0 + predict_class_array = np.array(y_data_pred) + y_data_pred_array = np.array(y_data_pred) + number_clusters_for_target = len(positive_labels) + for cancer_class in positive_labels: + predict_class_array[y_data_pred_array == cancer_class] = 1 + for benign_class in negative_labels: + predict_class_array[y_data_pred_array == benign_class] = 0 predict_class_array.reshape((predict_class_array.shape[0], -1)) - error_rate = sum(error_count) / N + error_rate = sum(error_count) / n_samples n_target = 
tp + fn tnr = tn / n_pbmc - fnr = fn / n_cancer - tpr = tp / n_cancer + fnr = fn / n_target + tpr = tp / n_target fpr = fp / n_pbmc - if tp != 0 or fn != 0: recall = tp / (tp + fn) # ability to find all positives - if tp != 0 or fp != 0: precision = tp / (tp + fp) # ability to not misclassify negatives as positives + if tp != 0 or fn != 0: + recall = tp / (tp + fn) # ability to find all positives + if tp != 0 or fp != 0: + precision = tp / (tp + fp) # ability to not misclassify negatives as positives if precision != 0 or recall != 0: f1_score = precision * recall * 2 / (precision + recall) - majority_truth_labels = np.empty((len(true_labels), 1), dtype=object) + majority_truth_labels = np.empty((len(y_data_true), 1), dtype=object) - for cluster_i in set(PARC_labels): - cluster_i_loc = np.where(np.asarray(PARC_labels) == cluster_i)[0] - true_labels = np.asarray(true_labels) - majority_truth = self.func_mode(list(true_labels[cluster_i_loc])) - majority_truth_labels[cluster_i_loc] = majority_truth + for community_id in set(y_data_pred): + community_indices = np.where(np.asarray(y_data_pred) == community_id)[0] + y_data_true = np.asarray(y_data_true) + majority_truth = get_mode(list(y_data_true[community_indices])) + majority_truth_labels[community_indices] = majority_truth majority_truth_labels = list(majority_truth_labels.flatten()) accuracy_val = [error_rate, f1_score, tnr, fnr, tpr, fpr, precision, @@ -526,65 +967,82 @@ def accuracy(self, onevsall=1): return accuracy_val, predict_class_array, majority_truth_labels, number_clusters_for_target - def run_PARC(self): - print('input data has shape', self.data.shape[0], '(samples) x', self.data.shape[1], '(features)') - if self.true_label is None: - self.true_label = [1] * self.data.shape[0] - list_roc = [] - - time_start_total = time.time() - - time_start_knn = time.time() - - time_end_knn_struct = time.time() - time_start_knn - # Query dataset, k - number of closest elements (returns 2 numpy arrays) - self.run_subPARC() - run_time = time.time() - time_start_total - print('time elapsed {:.1f} seconds'.format(run_time)) + def compute_performance_metrics(self, run_time: float): + """Compute performance metrics for the PARC algorithm. - targets = list(set(self.true_label)) - N = len(list(self.true_label)) + Args: + run_time: (float) the time taken to run the PARC algorithm. 
+ """ + list_roc = [] + targets = list(set(self.y_data_true)) + n_samples = len(list(self.y_data_true)) self.f1_accumulated = 0 self.f1_mean = 0 - self.stats_df = pd.DataFrame({'jac_std_global': [self.jac_std_global], 'dist_std_local': [self.dist_std_local], - 'runtime(s)': [run_time]}) + self.stats_df = pd.DataFrame({ + "jac_std_factor": [self.jac_std_factor], + "l2_std_factor": [self.l2_std_factor], + "runtime(s)": [run_time] + }) self.majority_truth_labels = [] if len(targets) > 1: f1_accumulated = 0 f1_acc_noweighting = 0 - for onevsall_val in targets: - print('target is', onevsall_val) - vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = self.accuracy( - onevsall=onevsall_val) + for target in targets: + logger.info(f"Target is {target}") + vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = \ + self.accuracy(target=target) f1_current = vals_roc[1] - print('target', onevsall_val, 'has f1-score of %.2f' % (f1_current * 100)) - f1_accumulated = f1_accumulated + f1_current * (list(self.true_label).count(onevsall_val)) / N + logger.info(f"Target {target} has f1-score of {(f1_current * 100):.2f}") + f1_accumulated = f1_accumulated + \ + f1_current * (list(self.y_data_true).count(target)) / n_samples f1_acc_noweighting = f1_acc_noweighting + f1_current list_roc.append( - [self.jac_std_global, self.dist_std_local, onevsall_val] + vals_roc + [numclusters_targetval] + [ - run_time]) + [self.jac_std_factor, self.l2_std_factor, target] + + vals_roc + + [numclusters_targetval] + + [run_time] + ) f1_mean = f1_acc_noweighting / len(targets) - print("f1-score (unweighted) mean %.2f" % (f1_mean * 100), '%') - print('f1-score weighted (by population) %.2f' % (f1_accumulated * 100), '%') - df_accuracy = pd.DataFrame(list_roc, - columns=['jac_std_global', 'dist_std_local', 'onevsall-target', 'error rate', - 'f1-score', 'tnr', 'fnr', - 'tpr', 'fpr', 'precision', 'recall', 'num_groups', - 'population of target', 'num clusters', 'clustering runtime']) + df_accuracy = pd.DataFrame( + list_roc, + columns=[ + "jac_std_factor", "l2_std_factor", "target", "error rate", + "f1-score", "tnr", "fnr", "tpr", "fpr", "precision", "recall", "num_groups", + "target population", "num clusters", "clustering runtime" + ] + ) + + logger.message(f"f1-score (unweighted) mean: {(f1_mean * 100):.2f}") + logger.message(f"f1-score weighted (by population): {(f1_accumulated * 100):.2f}") + logger.message( + f"\n{df_accuracy[['target', 'f1-score', 'target population', 'num clusters']]}" + ) self.f1_accumulated = f1_accumulated self.f1_mean = f1_mean self.stats_df = df_accuracy self.majority_truth_labels = majority_truth_labels - return - def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, - negative_sample_rate: int = 5, gamma: float = 1.0, spread=1.0, min_dist=0.1, - n_epochs=0, init_pos="spectral", random_state_seed=1, densmap=False, - densmap_kwds={}, output_dens=False): + def run_umap_hnsw( + self, + x_data, + graph, + n_components=2, + alpha: float = 1.0, + negative_sample_rate: int = 5, + gamma: float = 1.0, + spread=1.0, + min_dist=0.1, + n_epochs=0, + init_pos="spectral", + random_state_seed=1, + densmap=False, + densmap_kwds={}, + output_dens=False + ): """Perform a fuzzy simplicial set embedding, using a specified initialisation method and then minimizing the fuzzy set cross entropy between the 1-skeletons of the high and low-dimensional fuzzy simplicial sets. 
@@ -593,7 +1051,7 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, `__. Args: - X_input: (array) an array containing the input data, with shape n_samples x n_features. + x_data: (array) an array containing the input data, with shape n_samples x n_features. graph: (array) the 1-skeleton of the high dimensional fuzzy simplicial set as represented by a graph for which we require a sparse matrix for the (weighted) adjacency matrix. @@ -633,10 +1091,10 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, """ a, b = find_ab_params(spread, min_dist) - print(f"a: {a}, b: {b}, spread: {spread}, dist: {min_dist}") + logger.message(f"a: {a}, b: {b}, spread: {spread}, dist: {min_dist}") X_umap = simplicial_set_embedding( - data=X_input, + data=x_data, graph=graph, n_components=n_components, initial_alpha=alpha, @@ -655,3 +1113,76 @@ def run_umap_hnsw(self, X_input, graph, n_components=2, alpha: float = 1.0, output_dens=output_dens ) return X_umap[0] + + def load(self, file_path: pathlib.Path | str): + """Load a PARC object from a file. + + Args: + file_path: The full path name of the JSON file to load the ``PARC`` object from. + """ + logger.message(f"Loading PARC object from: {file_path}") + + file_path = pathlib.Path(file_path).resolve() + if not os.path.isfile(file_path): + raise ValueError(f"{file_path} is not a valid file.") + + if file_path.suffix != ".json": + raise ValueError(f"file_path must have a .json extension, got {file_path.suffix}.") + + with open(file_path, "r") as file: + model_dict = json.load(file) + + self.knn = model_dict["knn"] + self.n_iter_leiden = model_dict["n_iter_leiden"] + self.random_seed = model_dict["random_seed"] + self.distance_metric = model_dict["distance_metric"] + self.n_threads = model_dict["n_threads"] + self.hnsw_param_ef_construction = model_dict["hnsw_param_ef_construction"] + self.l2_std_factor = model_dict["l2_std_factor"] + self.jac_threshold_type = model_dict["jac_threshold_type"] + self.jac_std_factor = model_dict["jac_std_factor"] + self.jac_weighted_edges = model_dict["jac_weighted_edges"] + self.do_prune_local = model_dict["do_prune_local"] + self.large_community_factor = model_dict["large_community_factor"] + self.small_community_size = model_dict["small_community_size"] + self.small_community_timeout = model_dict["small_community_timeout"] + self.resolution_parameter = model_dict["resolution_parameter"] + self.partition_type = model_dict["partition_type"] + + def save(self, file_path: pathlib.Path | str): + """Save the PARC object to a file. + + Args: + file_path: The full path name of the JSON file to save the ``PARC`` object to. 
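+
+        Example:
+            An illustrative sketch; the file name is hypothetical:
+
+            .. code-block:: python
+
+                parc_model = PARC(x_data, y_data_true=y_data)
+                parc_model.fit_predict()
+                parc_model.save("./parc_model.json")
+
+                # the saved hyperparameters can later be restored with ``load``
+                new_model = PARC(x_data, y_data_true=y_data)
+                new_model.load("./parc_model.json")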
+ """ + + file_path = pathlib.Path(file_path).resolve() + if not os.path.isdir(file_path.parent): + raise ValueError(f"{file_path.parent} is not a valid directory.") + + if file_path.suffix != ".json": + raise ValueError(f"file_path must have a .json extension, got {file_path.suffix}.") + + model_dict = { + "knn": self.knn, + "n_iter_leiden": self.n_iter_leiden, + "random_seed": self.random_seed, + "distance_metric": self.distance_metric, + "n_threads": self.n_threads, + "hnsw_param_ef_construction": self.hnsw_param_ef_construction, + "l2_std_factor": self.l2_std_factor, + "jac_threshold_type": self.jac_threshold_type, + "jac_std_factor": self.jac_std_factor, + "jac_weighted_edges": self.jac_weighted_edges, + "do_prune_local": self.do_prune_local, + "large_community_factor": self.large_community_factor, + "small_community_size": self.small_community_size, + "small_community_timeout": self.small_community_timeout, + "resolution_parameter": self.resolution_parameter, + "partition_type": self.partition_type + } + + with open(file_path, "w") as file: + json.dump(model_dict, file) + + logger.message(f"PARC object saved to: {file_path}") diff --git a/parc/logger.py b/parc/logger.py new file mode 100644 index 0000000..0654545 --- /dev/null +++ b/parc/logger.py @@ -0,0 +1,95 @@ +"""Custom colored logger. + +See: + +`StackOverflow `__ +`python-colors GitHub `__ + +""" + +import logging +import sys + +MIN_LEVEL = logging.DEBUG +MESSAGE = 25 +logging.addLevelName(MESSAGE, "MESSAGE") +LOGGING_LEVEL = 25 + + +class LogFilter(logging.Filter): + """Filters (lets through) all messages with level < LEVEL""" + def __init__(self, level): + self.level = level + + def filter(self, record): + # "<" instead of "<=": since logger.setLevel is inclusive, this should + # be exclusive + return record.levelno < self.level + + +class Logger(logging.Logger): + def message(self, msg, *args, **kwargs): + if self.isEnabledFor(MESSAGE): + self._log(MESSAGE, msg, args, **kwargs) + + +class ColoredFormatter(logging.Formatter): + def __init__(self, fmt_prefix, fmt_msg): + self.use_color = self.supports_color() + self.fmt_prefix = fmt_prefix + self.fmt_msg = fmt_msg + super().__init__(fmt=f"{fmt_prefix} {fmt_msg}") + + def format(self, record: logging.LogRecord) -> str: + if record.levelno == logging.WARNING: + if self.use_color: + fmt = f"\x1b[93m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + elif record.levelno == logging.ERROR: + if self.use_color: + fmt = f"\x1b[91m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + elif record.levelno == 25 or record.levelno == logging.INFO: + if self.use_color: + fmt = f"\x1b[96m{self.fmt_prefix}\x1b[0m {self.fmt_msg}" + formatter = logging.Formatter(fmt) + return formatter.format(record) + else: + return super().format(record) + + def supports_color(self) -> bool: + """Check if the system supports ANSI color formatting.""" + return hasattr(sys.stdout, "isatty") and sys.stdout.isatty() + + +def get_logger(module_name, level: int = LOGGING_LEVEL) -> Logger: + logging.setLoggerClass(Logger) + stdout_handler = logging.StreamHandler(sys.stdout) + stderr_handler = logging.StreamHandler(sys.stderr) + stdout_handler.addFilter(LogFilter(logging.WARNING)) + stdout_handler.setLevel(level) + if level == 25: + formatter = ColoredFormatter( + fmt_prefix="[%(levelname)s]:", + fmt_msg="%(message)s" 
+ ) + else: + formatter = ColoredFormatter( + fmt_prefix="[%(levelname)s] %(name)s.%(funcName)s (line %(lineno)d):", + fmt_msg="%(message)s" + ) + stdout_handler.setFormatter(formatter) + stderr_handler.setLevel(max(MIN_LEVEL, logging.WARNING)) + # messages lower than WARNING go to stdout + # messages >= WARNING (and >= STDOUT_LOG_LEVEL) go to stderr + logger = logging.getLogger(module_name) + logger.propagate = False + logger.handlers = [stdout_handler, stderr_handler] + logger.setLevel(level) + return logger diff --git a/parc/utils.py b/parc/utils.py new file mode 100644 index 0000000..65e5d35 --- /dev/null +++ b/parc/utils.py @@ -0,0 +1,13 @@ +def get_mode(a_list: list[any]) -> any: + """Get the value which appears most often in the list (the mode). + + If multiple items are maximal, the function returns the first one encountered + (not necessarily the first one in the list). + + Args: + a_list: A list with values. + + Returns: + The most frequent value in the list + """ + return max(set(a_list), key=a_list.count) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0426d55 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +log_cli = true +log_cli_level = 25 +addopts = -rP \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..17ad205 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +pybind11 +numpy +scipy +pandas +hnswlib +igraph +leidenalg>=0.7.0 +umap-learn \ No newline at end of file diff --git a/setup.py b/setup.py index a38f07f..4831f3b 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,25 @@ import setuptools - from os import path -this_directory = path.abspath(path.dirname(__file__)) -with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: + +PARC_DIR = path.abspath(path.dirname(__file__)) + +with open(path.join(PARC_DIR, "README.md"), encoding="utf-8") as f: long_description = f.read() setuptools.setup( - name='parc', - version='0.40', #May 28,2024 - packages=['parc',], - license='MIT', - author_email='shobana.venkat88@gmail.com', - url='https://github.com/ShobiStassen/PARC', - setup_requires=['numpy', 'pybind11'], + name="parc", + version="0.40", + packages=["parc"], + license="MIT", + author_email="shobana.venkat88@gmail.com", + url="https://github.com/ShobiStassen/PARC", + setup_requires=["numpy", "pybind11"], install_requires=[ - 'pybind11', 'numpy', 'scipy', 'pandas', 'hnswlib', 'igraph', - 'leidenalg>=0.7.0', 'umap-learn' + line.strip() for line in open("requirements.txt") ], extras_require={ "dev": ["pytest", "scikit-learn"] }, long_description=long_description, - long_description_content_type='text/markdown' + long_description_content_type="text/markdown" ) diff --git a/tests/test_parc.py b/tests/test_parc.py index 5d6d523..c282fed 100644 --- a/tests/test_parc.py +++ b/tests/test_parc.py @@ -1,5 +1,38 @@ +import pytest from sklearn import datasets +import igraph as ig +import numpy as np +import pathlib +import json +import scipy +import igraph +import hnswlib from parc._parc import PARC +from parc.logger import get_logger +from tests.variables import NEIGHBOR_ARRAY_L2, NEIGHBOR_ARRAY_COSINE +from tests.utils import __tmp_dir__, create_tmp_dir, remove_tmp_dir + +logger = get_logger(__name__, 25) + + +def setup_function(): + create_tmp_dir() + + +@pytest.fixture +def iris_data(): + iris = datasets.load_iris() + x_data = iris.data + y_data = iris.target + return x_data, y_data + + +@pytest.fixture +def forest_data(): + forests = datasets.fetch_covtype() + x_data = 
forests.data[list(range(0, 30000)), :] + y_data = forests.target[list(range(0, 30000))] + return x_data, y_data def test_parc_run_umap_hnsw(): @@ -7,9 +40,385 @@ def test_parc_run_umap_hnsw(): x_data = iris.data y_data = iris.target - parc_model = PARC(x_data, true_label=y_data) - parc_model.run_PARC() + parc_model = PARC(x_data, y_data_true=y_data) + parc_model.fit_predict() - graph = parc_model.knngraph_full() + graph = parc_model.create_knn_graph() x_umap = parc_model.run_umap_hnsw(x_data, graph) assert x_umap.shape == (150, 2) + + +@pytest.mark.parametrize("knn", [5, 10, 20, 50]) +@pytest.mark.parametrize("jac_weighted_edges", [True, False]) +def test_parc_get_leiden_partition(iris_data, knn, jac_weighted_edges): + x_data = iris_data[0] + y_data = iris_data[1] + + parc_model = PARC(x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array) + + input_nodes, output_nodes = csr_array.nonzero() + + edges = list(zip(input_nodes, output_nodes)) + + graph = ig.Graph(edges, edge_attrs={"weight": csr_array.data.tolist()}) + + leiden_partition = parc_model.get_leiden_partition( + graph=graph, jac_weighted_edges=jac_weighted_edges + ) + + assert len(leiden_partition.membership) == len(y_data) + assert len(leiden_partition) <= len(y_data) + assert len(leiden_partition) >= 1 + + +@pytest.mark.parametrize( + "knn_hnsw, knn_query, distance_metric, expected_neighbor_array", + [ + (30, 2, "l2", NEIGHBOR_ARRAY_L2), + (30, 2, "cosine", NEIGHBOR_ARRAY_COSINE) + ] +) +def test_parc_create_hnsw_index( + iris_data, knn_hnsw, knn_query, distance_metric, expected_neighbor_array +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index( + x_data=x_data, + knn=knn_hnsw, + distance_metric=distance_metric + ) + assert isinstance(hnsw_index, hnswlib.Index) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn_query) + assert neighbor_array.shape == (x_data.shape[0], knn_query) + assert distance_array.shape == (x_data.shape[0], knn_query) + n_diff = np.count_nonzero(expected_neighbor_array != neighbor_array) + assert n_diff / x_data.shape[0] < 0.1 + + +@pytest.mark.parametrize("knn", [5, 10, 20, 50]) +def test_parc_create_knn_graph(iris_data, knn): + x_data = iris_data[0] + y_data = iris_data[1] + + parc_model = PARC(x_data, y_data_true=y_data) + parc_model.hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + csr_array = parc_model.create_knn_graph(knn=knn) + nn_collection = np.split(csr_array.indices, csr_array.indptr)[1:-1] + assert len(nn_collection) == y_data.shape[0] + + +@pytest.mark.parametrize( + "knn, l2_std_factor, n_edges", + [ + (100, 3.0, 14625), + (100, 100.0, 14850), + (100, -100.0, 0) + ] +) +def test_parc_prune_local( + iris_data, knn, l2_std_factor, n_edges +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array, l2_std_factor) + input_nodes, output_nodes = csr_array.nonzero() + edges = list(zip(input_nodes, output_nodes)) + assert isinstance(csr_array, scipy.sparse.csr_matrix) + assert len(edges) == n_edges + + +@pytest.mark.parametrize( + "knn, 
jac_threshold_type, jac_std_factor, jac_weighted_edges, n_edges", + [ + (100, "median", 0.3, True, 3679), + (100, "mean", 0.3, False, 3865), + (100, "mean", -1000, False, 0), + (100, "mean", 1000, False, 8558) + ] +) +def test_parc_prune_global( + iris_data, knn, jac_threshold_type, jac_std_factor, jac_weighted_edges, n_edges +): + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC(x_data=x_data, y_data_true=y_data) + hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=knn) + neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=knn) + csr_array = parc_model.prune_local(neighbor_array, distance_array) + graph_pruned = parc_model.prune_global( + csr_array=csr_array, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + n_samples=x_data.shape[0] + ) + assert isinstance(graph_pruned, igraph.Graph) + assert graph_pruned.ecount() == n_edges + assert graph_pruned.vcount() == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type," + "f1_mean, f1_accumulated" + ), + [ + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.15, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ) + ] +) +@pytest.mark.parametrize( + "targets_exist", + [ + (True), + (False) + ] +) +def test_parc_fit_predict_fast( + request, dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type, f1_mean, f1_accumulated, targets_exist +): + x_data, y_data = request.getfixturevalue(dataset_name) + if not targets_exist: + y_data = None + + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + + parc_model.fit_predict() + if targets_exist: + assert parc_model.f1_mean >= f1_mean + assert parc_model.f1_accumulated >= f1_accumulated + else: + assert parc_model.f1_mean == 0 + assert parc_model.f1_accumulated == 0 + assert len(parc_model.y_data_pred) == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type," + "f1_mean, f1_accumulated" + ), + [ + ( + "iris_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 30, 5, "l2", 150, 3.0, 
"median", 0.15, True, True, 0.15, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "iris_data", 2, 10, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.9, 0.9 + ), + ( + "forest_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.019, 10, 15, 1.0, + "ModularityVP", 0.6, 0.7 + ), + ( + "forest_data", 30, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.6, 0.7 + ), + ( + "forest_data", 100, 5, "l2", 150, 3.0, "median", 0.15, True, True, 0.4, 10, 15, 1.0, + "ModularityVP", 0.5, 0.6 + ) + ] +) +@pytest.mark.parametrize( + "targets_exist", + [ + (True), + (False) + ] +) +def test_parc_fit_predict_full( + request, dataset_name, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type, f1_mean, f1_accumulated, targets_exist +): + x_data, y_data = request.getfixturevalue(dataset_name) + if not targets_exist: + y_data = None + + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + + parc_model.fit_predict() + if targets_exist: + assert parc_model.f1_mean >= f1_mean + assert parc_model.f1_accumulated >= f1_accumulated + else: + assert parc_model.f1_mean == 0 + assert parc_model.f1_accumulated == 0 + assert len(parc_model.y_data_pred) == x_data.shape[0] + + +@pytest.mark.parametrize( + ( + "file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type" + ), + [ + ( + pathlib.Path(__tmp_dir__, "parc_model.json"), 30, 5, "l2", 150, 3.0, + "median", 0.15, True, True, 0.4, 10, 15, 1.0, "ModularityVP" + ) + ] +) +def test_parc_save( + iris_data, file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type +): + + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + parc_model.fit_predict() + parc_model.save(file_path) + assert file_path.exists() + with open(file_path) as file: 
+ model_dict = json.load(file) + + for key, value in model_dict.items(): + assert getattr(parc_model, key) == value + + +@pytest.mark.parametrize( + ( + "file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction," + "l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local," + "large_community_factor, small_community_size, small_community_timeout," + "resolution_parameter, partition_type" + ), + [ + ( + pathlib.Path(__tmp_dir__, "parc_model.json"), 30, 5, "l2", 150, 3.0, + "median", 0.15, True, True, 0.4, 10, 15, 1.0, "ModularityVP" + ) + ] +) +def test_parc_load( + iris_data, file_path, knn, n_iter_leiden, distance_metric, hnsw_param_ef_construction, + l2_std_factor, jac_threshold_type, jac_std_factor, jac_weighted_edges, do_prune_local, + large_community_factor, small_community_size, small_community_timeout, + resolution_parameter, partition_type +): + + x_data = iris_data[0] + y_data = iris_data[1] + parc_model = PARC( + x_data=x_data, + y_data_true=y_data, + knn=knn, + n_iter_leiden=n_iter_leiden, + distance_metric=distance_metric, + hnsw_param_ef_construction=hnsw_param_ef_construction, + l2_std_factor=l2_std_factor, + jac_threshold_type=jac_threshold_type, + jac_std_factor=jac_std_factor, + jac_weighted_edges=jac_weighted_edges, + do_prune_local=do_prune_local, + large_community_factor=large_community_factor, + small_community_size=small_community_size, + small_community_timeout=small_community_timeout, + resolution_parameter=resolution_parameter, + partition_type=partition_type + ) + parc_model.fit_predict() + parc_model.save(file_path) + + parc_model_saved = PARC( + x_data=x_data, + y_data_true=y_data, + file_path=file_path + ) + + with open(file_path) as file: + model_dict = json.load(file) + + for key, value in model_dict.items(): + assert getattr(parc_model_saved, key) == value + + + +def teardown_function(): + remove_tmp_dir() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..bc422ea --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,14 @@ +import pytest +from parc.utils import get_mode + + +@pytest.mark.parametrize( + "input_list, expected_mode", + [ + ([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]), + ([1, 2, 3, 4, 2, 6], [2]), + (["a", "b", "c", "d", "e", "a", "b", "c"], ["a", "b", "c"]) + ] +) +def test_get_mode(input_list, expected_mode): + assert get_mode(input_list) in expected_mode diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..affb6c0 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,25 @@ +import shutil +import pathlib + +__path_parc__ = pathlib.Path(__file__).parents[1].absolute() +__test_dir__ = pathlib.Path(__path_parc__, "tests") +__tmp_dir__ = pathlib.Path(__path_parc__, "tmp") + + +def create_tmp_dir(): + """Create a temporary directory for testing. + + 1. Remove the ``tmp`` directory if it exists. + 2. Create a new ``tmp`` directory. + + Any data files created during testing will go into ``tmp`` directory. + This is created/removed for each test. 
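+
+    Example (illustrative; mirrors how ``tests/test_parc.py`` wires these helpers
+    into pytest):
+
+    .. code-block:: python
+
+        def setup_function():
+            create_tmp_dir()
+
+        def teardown_function():
+            remove_tmp_dir()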
+ + """ + remove_tmp_dir() + pathlib.Path(__tmp_dir__).mkdir() + + +def remove_tmp_dir(): + """Recursively remove the ``tmp`` directory if it exists.""" + shutil.rmtree(__tmp_dir__, ignore_errors=True) \ No newline at end of file diff --git a/tests/variables.py b/tests/variables.py new file mode 100644 index 0000000..053e3bd --- /dev/null +++ b/tests/variables.py @@ -0,0 +1,42 @@ +import numpy as np + + +NEIGHBOR_ARRAY_L2 = np.array([ + [0, 17], [1, 12], [2, 47], [3, 47], [4, 0], [5, 18], [6, 47], [7, 39], [8, 38], [9, 34], + [10, 48], [11, 7], [12, 1], [13, 38], [14, 33], [15, 33], [16, 10], [17, 0], [18, 5], [19, 21], + [20, 31], [21, 19], [22, 6], [23, 26], [24, 11], [25, 34], [26, 23], [27, 39], [28, 0], + [29, 30], [30, 34], [31, 20], [32, 46], [33, 32], [34, 9], [35, 49], [36, 10], [37, 4], [38, 8], + [39, 7], [40, 17], [41, 8], [42, 38], [43, 26], [44, 46], [45, 1], [46, 19], [47, 2], [48, 10], + [49, 7], [50, 52], [51, 56], [52, 50], [53, 89], [54, 58], [55, 66], [56, 51], [57, 93], + [58, 75], [59, 89], [60, 93], [61, 96], [62, 92], [63, 91], [64, 82], [65, 75], [66, 84], + [67, 92], [68, 87], [69, 80], [70, 138], [71, 97], [72, 133], [73, 63], [74, 97], [75, 65], + [76, 58], [77, 52], [78, 91], [79, 81], [80, 81], [81, 80], [82, 92], [83, 133], [84, 66], + [85, 56], [86, 52], [87, 68], [88, 96], [89, 53], [90, 94], [91, 63], [92, 82], [93, 57], + [94, 99], [95, 96], [96, 95], [97, 74], [98, 57], [99, 96], [100, 136], [101, 142], [102, 125], + [103, 137], [104, 132], [105, 122], [106, 84], [107, 130], [108, 128], [109, 143], [110, 147], + [111, 147], [112, 139], [113, 101], [114, 121], [115, 148], [116, 137], [117, 131], [118, 122], + [119, 72], [120, 143], [121, 142], [122, 105], [123, 126], [124, 120], [125, 129], [126, 123], + [127, 138], [128, 132], [129, 125], [130, 107], [131, 117], [132, 128], [133, 83], [134, 103], + [135, 130], [136, 148], [137, 116], [138, 127], [139, 112], [140, 144], [141, 145], [101, 142], + [143, 120], [144, 140], [145, 141], [146, 123], [147, 110], [148, 136], [149, 127] +]) + +NEIGHBOR_ARRAY_COSINE = np.array([ + [0, 10], [1, 12], [2, 0], [3, 8], [4, 42], [5, 6], [6, 19], [7, 38], [8, 3], [9, 12], + [10, 0], [11, 29], [12, 9], [13, 0], [14, 13], [15, 33], [16, 40], [17, 2], [18, 39], [19, 6], + [20, 34], [21, 6], [22, 33], [23, 26], [24, 29], [25, 20], [26, 18], [27, 39], [28, 49], + [29, 3], [30, 8], [31, 45], [32, 33], [33, 15], [34, 20], [35, 36], [36, 35], [37, 4], [38, 7], + [39, 27], [40, 17], [41, 25], [42, 4], [43, 26], [44, 24], [45, 31], [46, 42], [47, 7], + [48, 10], [49, 28], [50, 74], [51, 99], [52, 86], [53, 54], [54, 52], [55, 63], [56, 59], + [57, 82], [58, 80], [59, 56], [60, 76], [61, 51], [62, 87], [63, 55], [64, 57], [65, 71], + [66, 84], [67, 69], [68, 87], [69, 58], [70, 84], [71, 65], [72, 129], [73, 90], [74, 50], + [75, 74], [76, 60], [77, 89], [78, 94], [79, 93], [80, 58], [81, 80], [82, 50], [83, 116], + [84, 66], [85, 56], [86, 97], [87, 60], [88, 95], [89, 52], [90, 55], [91, 94], [92, 69], + [93, 65], [94, 91], [95, 88], [96, 99], [97, 86], [98, 64], [99, 96], [100, 106], [101, 142], + [102, 111], [103, 83], [104, 142], [105, 108], [106, 121], [107, 105], [108, 122], [109, 115], + [110, 138], [111, 102], [112, 147], [113, 132], [114, 144], [115, 109], [116, 83], [117, 137], + [118, 122], [119, 130], [120, 140], [121, 109], [122, 108], [123, 126], [124, 149], [125, 83], + [126, 139], [127, 138], [128, 132], [129, 133], [130, 119], [131, 55], [132, 113], [133, 125], + [134, 107], [135, 146], [136, 148], [137, 117], [138, 
127], [139, 126], [140, 120], [141, 145], + [142, 101], [143, 142], [144, 115], [145, 141], [146, 135], [147, 139], [148, 136], [149, 124] +])
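The neighbor-array fixtures above can be regenerated approximately with a short script; the snippet below is only a sketch, assuming the fixtures come from a k=2 HNSW query on the iris data against an index built with knn=30 (the same way tests/test_parc.py consumes them), and small mismatches are expected because HNSW search is approximate.

    # sketch: regenerate an expected neighbor array for tests/variables.py
    from sklearn import datasets
    from parc._parc import PARC

    x_data, y_data = datasets.load_iris(return_X_y=True)
    parc_model = PARC(x_data=x_data, y_data_true=y_data)
    hnsw_index = parc_model.create_hnsw_index(x_data=x_data, knn=30, distance_metric="l2")
    neighbor_array, distance_array = hnsw_index.knn_query(x_data, k=2)
    print(repr(neighbor_array))  # compare against NEIGHBOR_ARRAY_L2; <10% mismatch tolerated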