diff --git a/reproducibility/figures/archive/pv/cytotrace.py b/reproducibility/figures/archive/pv/cytotrace.py
new file mode 100644
index 000000000..75336bc73
--- /dev/null
+++ b/reproducibility/figures/archive/pv/cytotrace.py
@@ -0,0 +1,1158 @@
+from time import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scvelo as scv
+from numpy import ndarray
+from scipy.stats import mannwhitneyu, rankdata
+
+##from scanorama import correct_scanpy
+scv.logging.verbosity = 3
+scv.settings.presenter_view = True
+scv.set_figure_params("scvelo")
+np.random.seed(99)
+
+from scipy.sparse import csr_matrix, issparse
+
+__all__ = [
+    "census_normalize",
+    "remove_zero_mvg",
+    "compute_similarity2",
+    "compute_similarity1",
+    "compute_gcs",
+    "convert_to_markov",
+    "any",
+    "find_nonzero",
+    "FNNLSa",
+    "nu",
+    "eps",
+    "regressed",
+    "diffused",
+    "compare_cytotrace",
+    "cytotrace",
+    "batch_cytotrace",
+    "compare_cytotrace_ncores",
+    "cytotrace_ncore",
+    "align_diffrate",
+    "plot_multiadata",
+    "cumulative_boxplot",
+    "run_cytotrace",  # NB: listed here but not defined in this module
+]
+
+
+# TODO: refactor to a subpackage exposing a minimal public interface
+# TODO: migrate all justifiably eager imports to the tops of new modules
+# TODO: enable runtime type-checking
+# TODO: remove unused comments and print statements
+# TODO: add unit tests
+
+
+def census_normalize(mat, count):
+    "RNA-seq census normalization to correct for cell lysis"
+    if issparse(mat):
+        # x = mat.copy()
+        # x.data = 2**x.data - 1
+        # gene x cell
+        x = mat.multiply(count).multiply(1.0 / mat.sum(axis=0)).tocsr()
+        return x.log1p()
+    else:
+        x = 2**mat - 1
+        # gene x cell
+        x = (x * count) / x.sum(axis=0)
+        return np.log2(x + 1)
+
+
+def remove_zero_mvg(mat):
+    """Keep genes expressed in at least 5% of cells and return the top ~1000
+    most variable genes (by dispersion); callers then drop cells that express
+    none of these genes."""
+    # gene x cell
+    cell_count = (mat > 0).sum(axis=1)
+    X_topcell = mat[cell_count >= 0.05 * mat.shape[1], :]
+    var = X_topcell.var(axis=1)
+    mean = X_topcell.mean(axis=1)
+    disp = var / mean
+    # threshold at the 1000th-largest dispersion (fewer if <1000 genes remain)
+    return X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :]
+
+
+def compute_similarity2(O: ndarray, P: ndarray) -> ndarray:
+    "Column-wise Pearson correlations between matrices O and P via Einstein summation"
+    (n, t) = O.shape  # n traces of t samples
+    (n_bis, m) = P.shape  # n predictions for each of m candidates
+
+    DO = O - (np.einsum("nt->t", O, optimize="optimal") / np.double(n))  # O - mean(O)
+    DP = P - (np.einsum("nm->m", P, optimize="optimal") / np.double(n))  # P - mean(P)
+
+    cov = np.einsum("nm,nt->mt", DP, DO, optimize="optimal")
+
+    varP = np.einsum("nm,nm->m", DP, DP, optimize="optimal")
+    varO = np.einsum("nt,nt->t", DO, DO, optimize="optimal")
+    tmp = np.einsum("m,t->mt", varP, varO, optimize="optimal")
+    return cov / np.sqrt(tmp)
+
+
+# adapted from https://github.com/ikizhvatov/efficient-columnwise-correlation/blob/master/columnwise_corrcoef_perf.py
+def compute_similarity1(A):
+    "Pairwise Pearson correlation of all columns in matrix A"
+    n, t = A.shape  # genes x samples
+    DO = A - (np.einsum("nt->t", A, optimize="optimal") / np.double(n))  # A - mean(A)
+    cov = np.einsum("nm,nt->mt", DO, DO, optimize="optimal")
+    varP = np.einsum("nt,nt->t", DO, DO, optimize="optimal")
+    return cov / np.sqrt(np.einsum("m,t->mt", varP, varP, optimize="optimal"))
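+
+
+# Reviewer addition -- a hedged sanity sketch, not part of the original code:
+# both einsum helpers above should agree with numpy's reference np.corrcoef
+# on random dense data. The synthetic shapes are arbitrary; safe to delete.
+def _check_compute_similarity(n_genes: int = 50, n_cells: int = 20) -> bool:
+    rng = np.random.default_rng(0)
+    A = rng.normal(size=(n_genes, n_cells))
+    ref = np.corrcoef(A, rowvar=False)  # cells x cells Pearson matrix
+    ok1 = np.allclose(compute_similarity1(A), ref)
+    # compute_similarity2 correlates columns of O against columns of P
+    ok2 = np.allclose(compute_similarity2(A, A[:, :1])[0, :], ref[0, :])
+    return ok1 and ok2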
+
+
+def compute_gcs(mat, count, top_n_genes=200):
+    """Compute the gene-counts signature (GCS): the mean expression of the
+    top_n_genes genes whose expression correlates best with gene counts."""
+    corrs = compute_similarity2(mat.T, count.reshape(-1, 1))[0, :]
+    # avoid nan, set all nan to -1
+    corrs[np.isnan(corrs)] = -1
+    gcs = mat[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)
+    return gcs
+
+
+def convert_to_markov(sim):
+    """Convert the Pearson correlation matrix to a Markov transition matrix
+
+    TODO: use the velocity graph to replace this Markov matrix
+    """
+    Ds = np.copy(sim)
+
+    cutoff = np.mean(Ds)
+    np.fill_diagonal(Ds, 0)
+
+    Ds[Ds <= 0] = 0
+    Ds[Ds <= cutoff] = 0
+
+    zero_rows = Ds.sum(axis=1) == 0
+    zero_cols = Ds.sum(axis=0) == 0
+
+    # row-normalize the nonzero rows
+    Ds[~zero_rows, :] = (Ds[~zero_rows, :].T / Ds[~zero_rows, :].sum(axis=1)).T  # tips
+    Ds[:, zero_cols] = 0
+    return Ds
+
+
+# http://xrm.phys.northwestern.edu/research/pdf_papers/1997/bro_chemometrics_1997.pdf
+# matlab reference: https://www.mathworks.com/matlabcentral/mlc-downloads/downloads/submissions/3388/versions/1/previews/fnnls.m/index.html
+
+from numpy import (
+    abs,
+    arange,
+    argmax,
+    finfo,
+    float64,
+    int64,
+    min,
+    newaxis,
+    nonzero,
+    sum,
+    zeros,
+)
+from scipy.linalg import solve
+
+nu = newaxis
+import numpy as np
+
+# machine epsilon
+eps = finfo(float64).eps
+
+
+def any(a):
+    # NB: shadows the builtin any(); assumes a vector `a`
+    larger_than_zero = sum(a > 0)
+    if larger_than_zero:
+        return True
+    else:
+        return False
+
+
+def find_nonzero(a):
+    # returns indices of nonzero elements in a
+    return nonzero(a)[0]
+
+
+def FNNLSa(XtX, Xty, tol=None):
+    """Fast NNLS, imported from https://github.com/delnatan/FNNLSa
+
+    Bro & De Jong, "A fast non-negativity-constrained least squares
+    algorithm", Journal of Chemometrics, 1997.
+    """
+    if tol is None:
+        tol = eps
+
+    M, N = XtX.shape
+    # initialize passive set, P: indices where the coefficient is > 0
+    P = zeros(N, dtype=int64)
+    # and active set, Z: indices where the coefficient is <= 0
+    Z = arange(N) + 1
+    # working active set
+    ZZ = arange(N) + 1
+    # initial solution vector, x
+    x = zeros(N, dtype=float64)
+    # weight vector
+    w = Xty - XtX @ x
+    # iteration counts and parameter
+    it = 0
+    itmax = 30 * N
+    # MAIN LOOP
+    # continue as long as there are indices in the active set Z
+    # and the corresponding weights exceed the tolerance
+    piter = 0
+    while any(Z) and any(w[ZZ - 1] > tol):
+        piter += 1
+        t = argmax(w[ZZ - 1]) + 1  # find largest weight
+        t = ZZ[t - 1]
+        P[t - 1] = t  # move to passive set
+        Z[t - 1] = 0  # remove from active set
+        PP = find_nonzero(P) + 1
+        ZZ = find_nonzero(Z) + 1
+        NZZ = ZZ.shape
+
+        # compute trial solution, s
+        s = zeros(N, dtype=float64)
+
+        if len(PP) == 1:
+            s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
+        else:
+            s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
+        s[ZZ - 1] = 0.0  # set active coefficients to 0
+
+        # inner loop: back off until the trial solution is feasible
+        while any(s[PP - 1] <= tol) and it < itmax:
+            it = it + 1
+            QQ = find_nonzero((s <= tol) * P) + 1
+            alpha = min(x[QQ - 1] / (x[QQ - 1] - s[QQ - 1]))
+            x = x + alpha * (s - x)
+            ij = find_nonzero((abs(x) < tol) * (P != 0)) + 1
+            Z[ij - 1] = ij
+            P[ij - 1] = 0
+            PP = find_nonzero(P) + 1
+            ZZ = find_nonzero(Z) + 1
+            if len(PP) == 1:
+                s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
+            else:
+                s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
+            s[ZZ - 1] = 0.0
+        # assign current solution, s, to x
+        x = s
+        # recompute weights
+        w = Xty - XtX @ x
+    return x, w
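+
+
+# Reviewer addition -- hedged sanity sketch: FNNLSa takes the normal-equation
+# form (XtX, Xty) of non-negative least squares, so on a small random,
+# well-conditioned problem it should agree with scipy.optimize.nnls.
+# Synthetic sizes are arbitrary; safe to delete.
+def _check_fnnls(n: int = 12, m: int = 5) -> bool:
+    from scipy.optimize import nnls
+
+    rng = np.random.default_rng(1)
+    X = rng.random((n, m))
+    y = rng.random(n)
+    x_ref, _ = nnls(X, y)
+    x_fast, _ = FNNLSa(X.T @ X, X.T @ y)
+    return np.allclose(x_ref, x_fast, atol=1e-6)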
+
+
+def regressed(markov, gcs, solver="fnnls"):
+    """Solve the non-negative least squares problem markov @ coef = gcs and
+    return the fitted values markov @ coef.
+
+    solver: "fnnls" (default) is faster on larger datasets, e.g. above
+    20,000 cells; "nnls" is faster on smaller datasets, e.g. fewer than
+    5,000 cells.
+    """
+    print(solver)
+    if solver == "fnnls":
+        coef, err = FNNLSa(markov.T @ markov, markov.T @ gcs)
+    elif solver == "jfnnls":  # pip install fnnls  # https://github.com/jvendrow/fnnls
+        from fnnls import fnnls
+
+        coef, err = fnnls(markov, gcs)
+    elif solver == "nnls":
+        from scipy.optimize import nnls
+
+        print((markov > 0).sum())
+        coef, err = nnls(markov, gcs)  # from 1987...
+    elif solver == "lsq_linear":
+        from scipy import sparse
+        from scipy.optimize import lsq_linear
+
+        # slow as well
+        sol = lsq_linear(
+            sparse.csr_matrix(markov),
+            gcs,
+            bounds=(0, np.inf),
+            lsmr_tol="auto",
+            verbose=1,
+        )  # from 1997...
+        coef = sol.x
+    elif solver == "lasso":
+        from scipy import sparse
+        from sklearn.linear_model import Lasso
+
+        lasso = Lasso(alpha=1e-7, max_iter=100000, fit_intercept=False, positive=True)
+        den = (markov > 0).sum() / np.prod(markov.shape)
+        print(den)
+        if den >= 0.5:
+            lasso.fit(markov, gcs)
+        else:
+            # lasso.fit(sparse.csr_matrix(markov), gcs)
+            lasso.fit(sparse.coo_matrix(markov), gcs)
+        coef = lasso.coef_
+    elif solver == "nnlsgd":  # somehow not consistent with nnls and fnnls...
+        # NB: NNLS_GD is not defined in this module
+        coef = NNLS_GD(markov.T @ markov, markov.T @ gcs)
+    elif solver is None:  # do not fit
+        return gcs
+    else:  # tnt-nn
+        raise NotImplementedError
+    return np.dot(markov, coef)
+
+
+def diffused(markov, gcs, ALPHA=0.9):
+    """Smooth gcs over the Markov matrix by iterating
+    v <- ALPHA * markov @ v + (1 - ALPHA) * gcs until convergence."""
+    v_prev = np.copy(gcs)
+    v_curr = np.copy(gcs)
+
+    for i in range(10000):
+        v_prev = np.copy(v_curr)
+        v_curr = ALPHA * (markov.dot(v_curr)) + (1 - ALPHA) * gcs
+
+        diff = np.mean(np.abs(v_curr - v_prev))
+        if diff <= 1e-6:
+            break
+    return v_curr
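+
+
+# Reviewer addition -- hedged sanity sketch: for a row-stochastic matrix M the
+# iteration in diffused() converges to the fixed point
+# v* = (1 - ALPHA) * (I - ALPHA * M)^-1 @ gcs, so the two should agree on a
+# tiny random example. Safe to delete.
+def _check_diffused(n: int = 6, alpha: float = 0.9) -> bool:
+    rng = np.random.default_rng(2)
+    M = rng.random((n, n))
+    M = M / M.sum(axis=1, keepdims=True)  # row-stochastic
+    g = rng.random(n)
+    v_iter = diffused(M, g, ALPHA=alpha)
+    v_exact = (1 - alpha) * np.linalg.solve(np.eye(n) - alpha * M, g)
+    return np.allclose(v_iter, v_exact, atol=1e-4)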
adata.uns["gcs"] + + # for indepent gcs feature + # using different set of genes is very variable + # gcs = np.zeros(len(cells_selected)) + + # condition-specific steps + for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]): + selection = adata.obs.loc[:, condition].values[cells_selected] == cond + + gcs_cond = gcs[selection] + counts_cond = counts[selection] + mat2_cond = mat2[:, selection] + + # compute gcs feature independently for two conditions + # this would select different gene set, and make the + # cytotrace score incomparable + # gcs_cond = compute_gcs(mat2_cond, counts_cond) + + mvg_cond = mvg[:, selection] + + trans_time = time() + print("preprocessing: %s" % (trans_time - proc_time)) + # get transition matrix + corr_cond = compute_similarity1(mvg_cond) + markov_cond = convert_to_markov(corr_cond) + + gc_time = time() + print("markov: %s" % (gc_time - trans_time)) + + # get Gene count signature + gcs_cond = regressed(markov_cond, gcs_cond, solver=solver) + gcs_cond = diffused(markov_cond, gcs_cond) + gcs[selection] = gcs_cond + + # condition specific cytotrace correlated gene sets + scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0] + score_time = time() + print("score time: %s" % (score_time - gc_time)) + + corrs_cond = compute_similarity2( + mat2_cond.T, scores_cond.reshape(-1, 1) + )[0, :] + final_time = time() + print("final time: %s" % (final_time - score_time)) + adata.var["cytotrace_corrs_%s" % cond] = None + adata.var.iloc[ + genes_selected, + adata.var.columns.get_loc("cytotrace_corrs_%s" % cond), + ] = corrs_cond + + adata.obs["gcs"] = None + adata.obs["cytotrace"] = None + adata.obs["counts"] = None + adata.obs.iloc[ + cells_selected, adata.obs.columns.get_loc("gcs") + ] = gcs # used for re-rank when integrating multiple-samples + adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = ( + rankdata(gcs) / gcs.shape[0] + ) # cytotrace scores + adata.obs.iloc[ + cells_selected, adata.obs.columns.get_loc("counts") + ] = counts # used for re-rank when integrating multiple-samples + + +def cytotrace_sparse( + adata, + layer="raw", + cell_count=20, + solver="nnls", + top_n_features=200, + skip_regress=False, +): + "optimized version" + proc_time = time() + if not issparse(adata.layers[layer]): + raise NotImplementedError + else: + X = adata.layers[layer] + + cells_selected = np.arange(X.shape[0]) + features_selected = np.arange(X.shape[1]) + + n_cells = X.shape[0] + feature_mean = (X.sum(0) / n_cells).A1 + feature_mean_sq = (X.multiply(X).sum(0) / n_cells).A1 + feature_var = (feature_mean_sq - feature_mean**2) * ( + n_cells / (n_cells - 1) + ) + non_zero_features = (X > 0).sum(axis=0).A1 + pfeatures = np.isnan(non_zero_features) | (feature_var == 0) + + # cell x feature, filter feature + X = X[:, ~pfeatures] + features_selected = features_selected[~pfeatures] + + # gene x cell + X = X.T + X = X.multiply(1.0 / csr_matrix.sum(X, axis=0)) * 1e6 + X = X.tocsr() + + # filter cell, feature x cell + feature_count_per_cell = (X > 0).sum(axis=0).A1 + pcells = np.isnan(feature_count_per_cell) | ( + feature_count_per_cell < cell_count + ) + + # feature x cell + X = X[:, ~pcells] + cells_selected = cells_selected[~pcells] + feature_count_per_cell = feature_count_per_cell[~pcells] + + # census normalize, feature x cell + census_X = ( + X.multiply(feature_count_per_cell) + .multiply(1.0 / csr_matrix.sum(X, axis=0)) + .log1p() + .tocsr() + ) + + # top variable features, feature x cell + cell_count_per_feature = (census_X > 0).sum(axis=1).A1 + 
+
+
+def cytotrace_sparse(
+    adata,
+    layer="raw",
+    cell_count=20,
+    solver="nnls",
+    top_n_features=200,
+    skip_regress=False,
+):
+    "Optimized sparse implementation of the CytoTRACE reimplementation"
+    proc_time = time()
+    if not issparse(adata.layers[layer]):
+        raise NotImplementedError
+    else:
+        X = adata.layers[layer]
+
+    cells_selected = np.arange(X.shape[0])
+    features_selected = np.arange(X.shape[1])
+
+    # unbiased feature variance from sparse first and second moments
+    n_cells = X.shape[0]
+    feature_mean = (X.sum(0) / n_cells).A1
+    feature_mean_sq = (X.multiply(X).sum(0) / n_cells).A1
+    feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1))
+    non_zero_features = (X > 0).sum(axis=0).A1
+    pfeatures = np.isnan(non_zero_features) | (feature_var == 0)
+
+    # cell x feature, filter features
+    X = X[:, ~pfeatures]
+    features_selected = features_selected[~pfeatures]
+
+    # feature x cell, counts-per-million
+    X = X.T
+    X = X.multiply(1.0 / csr_matrix.sum(X, axis=0)) * 1e6
+    X = X.tocsr()
+
+    # filter cells, feature x cell
+    feature_count_per_cell = (X > 0).sum(axis=0).A1
+    pcells = np.isnan(feature_count_per_cell) | (feature_count_per_cell < cell_count)
+
+    # feature x cell
+    X = X[:, ~pcells]
+    cells_selected = cells_selected[~pcells]
+    feature_count_per_cell = feature_count_per_cell[~pcells]
+
+    # census normalize, feature x cell
+    census_X = (
+        X.multiply(feature_count_per_cell)
+        .multiply(1.0 / csr_matrix.sum(X, axis=0))
+        .log1p()
+        .tocsr()
+    )
+
+    # top variable features, feature x cell
+    cell_count_per_feature = (census_X > 0).sum(axis=1).A1
+    census_X_topcell = census_X[
+        cell_count_per_feature >= 0.05 * census_X.shape[1], :
+    ].tocsr()
+    n_cells = census_X_topcell.shape[1]
+    feature_mean = (census_X_topcell.sum(1) / n_cells).A1
+    feature_mean_sq = (census_X_topcell.multiply(census_X_topcell).sum(1) / n_cells).A1
+    feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1))
+    disp = feature_var / feature_mean
+    # threshold at the 1000th-largest dispersion (fewer if <1000 features remain)
+    mvg = census_X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :]
+
+    # top 1000 variable features for the markov matrix
+    selection = (mvg.sum(axis=0) != 0).A1
+    mvg = mvg[:, selection]
+    corr = compute_similarity1(mvg.A)
+    markov = convert_to_markov(corr)
+
+    # filter census output with nonzero cells in the top variable features matrix
+    cells_selected = cells_selected[selection]
+    census_X = census_X[:, selection]
+    feature_count_per_cell = feature_count_per_cell[selection]
+
+    # calculate GC scores
+    corrs = compute_similarity2(census_X.T.A, feature_count_per_cell.reshape(-1, 1))[
+        0, :
+    ]
+    # avoid nan, set all nan to -1
+    corrs[np.isnan(corrs)] = -1
+    gcs = census_X[np.argsort(corrs)[::-1][:top_n_features], :].mean(axis=0).A1
+
+    if not skip_regress:
+        # from scipy.optimize import lsq_linear
+        # # slow as well
+        # sol = lsq_linear(csr_matrix(markov), gcs, bounds=(0, np.inf), lsmr_tol='auto', verbose=1)  # from 1997...
+        # coef = sol.x
+
+        from scipy.optimize import nnls
+
+        coef, err = nnls(markov, gcs)  # from 1987...
+        gcs = np.dot(markov, coef)
+
+    gcs = diffused(markov, gcs)
+    rank = rankdata(gcs)
+    scores = rank / gcs.shape[0]
+
+    adata.obs["gcs"] = np.nan
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
+
+    cytoGenes = compute_similarity2(census_X.T.A, scores.reshape(-1, 1))[0, :]
+
+    adata.obs["cytotrace"] = np.nan
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores
+
+    adata.obs["counts"] = np.nan
+    adata.obs.iloc[
+        cells_selected, adata.obs.columns.get_loc("counts")
+    ] = feature_count_per_cell
+
+    adata.var["cytotrace"] = False
+    adata.var.iloc[features_selected, adata.var.columns.get_loc("cytotrace")] = True
+
+    adata.var["cytotrace_corrs"] = np.nan
+    adata.var.iloc[
+        features_selected, adata.var.columns.get_loc("cytotrace_corrs")
+    ] = np.array(corrs, dtype=np.float32)
+    return {
+        "CytoTRACE": scores,
+        "CytoTRACErank": rank,
+        "GCS": gcs,
+        "Counts": feature_count_per_cell,
+        "exprMatrix": census_X,
+        "cytoGenes": cytoGenes,
+        "gcsGenes": corrs,
+        "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected),
+    }
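+
+
+# Reviewer addition -- hedged sanity sketch: cytotrace_sparse() derives the
+# unbiased feature variance from sparse first and second moments,
+# var = (E[x^2] - E[x]^2) * n / (n - 1). The helper checks that identity
+# against numpy's ddof=1 variance on random dense data; safe to delete.
+def _check_sparse_variance(n_cells: int = 100, n_features: int = 30) -> bool:
+    rng = np.random.default_rng(3)
+    dense = rng.poisson(1.0, size=(n_cells, n_features)).astype(float)
+    X = csr_matrix(dense)
+    mean = (X.sum(0) / n_cells).A1
+    mean_sq = (X.multiply(X).sum(0) / n_cells).A1
+    var = (mean_sq - mean**2) * (n_cells / (n_cells - 1))
+    return np.allclose(var, dense.var(axis=0, ddof=1))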
+
+
+def cytotrace(adata, layer="all", cell_count=10, solver="nnls", top_n_genes=200):
+    """Main interface of the CytoTRACE reimplementation for a single dataset
+    with one condition."""
+    proc_time = time()
+    if layer == "all":
+        X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray()
+    else:
+        try:
+            X = (
+                adata.layers[layer].toarray()
+                if issparse(adata.layers[layer])
+                else adata.layers[layer]
+            )
+        except KeyError:
+            X = adata.X.toarray() if issparse(adata.X) else adata.X
+
+    cells_selected = np.arange(X.shape[0])
+    genes_selected = np.arange(X.shape[1])
+
+    pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0)
+    # cell x gene
+    X = X[:, ~pgenes]
+    # gene x cell, counts-per-million
+    X = (X.T / X.sum(axis=1)) * 1e6
+
+    pqcells = (pd.isnull((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
+    X = X[:, ~pqcells]
+
+    genes_selected = genes_selected[~pgenes]
+    cells_selected = cells_selected[~pqcells]
+
+    X = np.log2(X + 1)
+    counts = (X > 0).sum(axis=0)
+    mat2 = census_normalize(X, counts)
+
+    mvg = remove_zero_mvg(mat2)
+    selection = mvg.sum(axis=0) != 0
+    mvg = mvg[:, selection]
+    mat2 = mat2[:, selection]
+    counts = counts[selection]
+
+    cells_selected = cells_selected[selection]
+
+    trans_time = time()
+    print("processing: %s" % (trans_time - proc_time))
+
+    # get transition matrix
+    corr = compute_similarity1(mvg)
+    markov = convert_to_markov(corr)
+
+    gc_time = time()
+    print("markov: %s" % (gc_time - trans_time))
+
+    # get gene-counts signature
+    # gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes)
+    corrs = compute_similarity2(mat2.T, counts.reshape(-1, 1))[0, :]
+    # avoid nan, set all nan to -1
+    corrs[np.isnan(corrs)] = -1
+    gcs = mat2[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)
+
+    gcs_time = time()
+    print("gcs: %s" % (gcs_time - gc_time))
+
+    gcs = regressed(markov, gcs, solver=solver)
+    regress_time = time()
+    print("regression: %s" % (regress_time - gcs_time))
+
+    gcs = diffused(markov, gcs)
+    rank = rankdata(gcs)
+    scores = rank / gcs.shape[0]
+    score_time = time()
+    print("diffusion: %s" % (score_time - regress_time))
+
+    cyto_corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]
+    gene_time = time()
+    print("genes: %s" % (gene_time - score_time))
+
+    adata.obs["gcs"] = None
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
+
+    adata.obs["cytotrace"] = None
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores
+
+    adata.obs["counts"] = None
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts
+
+    adata.var["cytotrace"] = None
+    adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True
+
+    adata.var["cytotrace_corrs"] = None
+    adata.var.iloc[
+        genes_selected, adata.var.columns.get_loc("cytotrace_corrs")
+    ] = cyto_corrs
+    return {
+        "CytoTRACE": scores,
+        "CytoTRACErank": rank,
+        "GCS": gcs,
+        "Counts": counts,
+        "exprMatrix": mat2,
+        "cytoGenes": cyto_corrs,
+        "gcsGenes": corrs,
+        "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected),
+    }
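+
+
+# Reviewer addition -- hedged usage sketch for the single-condition path.
+# The scvelo pancreas dataset and the subsampling are purely illustrative;
+# any AnnData with "spliced"/"unspliced" layers, a UMAP embedding, and a
+# "clusters" column should work with cytotrace() plus visualize() below.
+def _demo_cytotrace():
+    adata = scv.datasets.pancreas()[::10].copy()
+    results = cytotrace(adata, layer="all", solver="nnls")
+    print(results["CytoTRACE"][:10])
+    visualize(adata, results, name="pancreas")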
+
+
+def visualize(adata, metrics, name="test"):
+    import scanpy as sc
+    import seaborn as sns
+
+    fig, ax = plt.subplots(2)
+    fig.set_size_inches(8, 6)
+    sc.pl.umap(adata, color="cytotrace", show=False, ax=ax[0])
+    order = adata.obs.groupby("clusters").median().sort_values(["cytotrace"]).index
+    bplot = sns.boxplot(
+        x="clusters",
+        y="cytotrace",
+        data=adata.obs,
+        order=order,
+        width=0.35,
+        ax=ax[1],
+    )
+    ax[1].set_xticklabels(order, rotation=40, ha="right")
+    fig.savefig(f"{name}_figure.pdf")
+
+
+def batch_cytotrace(mvg_batch, gcs_batch, solver="jfnnls"):
+    "Markov/regression/diffusion steps for one batch of cells"
+    corr_batch = compute_similarity1(mvg_batch)
+    markov_batch = convert_to_markov(corr_batch)
+    gcs_time = time()
+    gcs_batch = regressed(markov_batch, gcs_batch, solver=solver)
+    regress_time = time()
+    print("regression: %s" % (regress_time - gcs_time))
+    print(markov_batch.shape, gcs_batch.shape)
+    gcs_batch = diffused(markov_batch, gcs_batch)
+    return gcs_batch
+
+
+from scipy.sparse import issparse
+
+
+def compare_cytotrace_ncores(
+    adata,
+    layer="all",
+    cell_count=10,
+    condition="age",
+    solver="nnls",
+    is_normalized=False,
+    ncores=4,
+    batch_cell=2000,
+):
+    """Main interface of the CytoTRACE reimplementation for a single dataset
+    with multiple conditions, parallelized over batches of cells."""
+    # condition-common steps
+    proc_time = time()
+    if not is_normalized:
+        if layer == "all":
+            X = adata.layers["spliced"] + adata.layers["unspliced"]  # .toarray()
+        else:
+            try:
+                X = (
+                    adata.layers[layer]
+                    if issparse(adata.layers[layer])
+                    else adata.layers[layer]
+                )
+            except KeyError:
+                X = adata.X
+
+        cells_selected = np.arange(X.shape[0])
+        genes_selected = np.arange(X.shape[1])
+
+        from sklearn.utils import sparsefuncs_fast
+
+        mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X)
+
+        pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0)
+
+        # cell x gene
+        X = X[:, ~pgenes]
+        # gene x cell, sparse per-cell normalization
+        X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr()
+        pqcells = np.array(
+            (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
+        )[0]
+        X = X[:, ~pqcells]
+
+        genes_selected = genes_selected[~pgenes]
+        cells_selected = cells_selected[~pqcells]
+
+        counts = np.array((X > 0).sum(axis=0))[0]
+        mat2 = census_normalize(X, counts).toarray()
+        mvg = remove_zero_mvg(mat2)
+
+        sel = mvg.sum(axis=0) != 0
+        mat2 = mat2[:, sel]
+        counts = counts[sel]
+        cells_selected = cells_selected[sel]
+
+        # join the conditions to determine the gcs feature
+        # using the same set of genes
+        gcs = compute_gcs(mat2, counts)
+        adata.uns["gcs"] = gcs
+        adata.uns["cells_selected"] = cells_selected
+        adata.uns["counts"] = counts
+        adata.uns["mat2"] = mat2
+        adata.uns["mvg"] = mvg
+        adata.uns["genes_selected"] = genes_selected
+    else:
+        cells_selected = adata.uns["cells_selected"]
+        counts = adata.uns["counts"]
+        mat2 = adata.uns["mat2"]
+        mvg = adata.uns["mvg"]
+        genes_selected = adata.uns["genes_selected"]
+        gcs = adata.uns["gcs"]
+
+    # for an independent gcs feature:
+    # using a different gene set per condition is very variable
+    # gcs = np.zeros(len(cells_selected))
+
+    # parallel part
+    import multiprocessing as mp
+
+    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
+    # condition-specific steps
+    pool = mp.Pool(ncores)
+
+    for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]):
+        print(cond)
+        selection = adata.obs.loc[:, condition].values[cells_selected] == cond
+
+        gcs_cond = gcs[selection]
+        counts_cond = counts[selection]
+        mat2_cond = mat2[:, selection]
+        mvg_cond = mvg[:, selection]
+
+        # split cells into contiguous batches
+        mvg_gcs_batches = []
+        for i in range(0, gcs_cond.shape[0], batch_cell):
+            mvg_batch = mvg_cond[:, i : (i + batch_cell)].copy()
+            gcs_batch = gcs_cond[i : i + batch_cell].copy()
+            print(mvg_batch.shape, gcs_batch.shape)
+            mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])
+
+        # http://jennguyen1.github.io/nhuyhoa/software/Parallel-Processing.html
+        result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
+        result.wait()
+
+        gcs_batches = result.get()
+        gcs_cond = np.hstack(gcs_batches)
+
+        trans_time = time()
+        print("preprocessing: %s" % (trans_time - proc_time))
+
+        gc_time = time()
+        print("markov: %s" % (gc_time - trans_time))
+        gcs[selection] = gcs_cond
+
+        # condition-specific cytotrace-correlated gene sets
+        scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0]
+        score_time = time()
+        print("score time: %s" % (score_time - gc_time))
+
+        corrs_cond = compute_similarity2(mat2_cond.T, scores_cond.reshape(-1, 1))[0, :]
+        final_time = time()
+        print("final time: %s" % (final_time - score_time))
+        adata.var["cytotrace_corrs_%s" % cond] = None
+        adata.var.iloc[
+            genes_selected,
+            adata.var.columns.get_loc("cytotrace_corrs_%s" % cond),
+        ] = corrs_cond
+
+    pool.close()
+    pool.join()
+
+    adata.obs["gcs"] = None
+    adata.obs["cytotrace"] = None
+    adata.obs["counts"] = None
+    # gcs and counts are kept for re-ranking when integrating multiple samples
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = (
+        rankdata(gcs) / gcs.shape[0]
+    )  # cytotrace scores
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts
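+
+
+# Reviewer addition -- hedged sketch of the batching pattern used above:
+# cells (columns) are split into contiguous blocks, each block gets its own
+# Markov matrix via batch_cytotrace(), and the per-block GCS vectors are
+# reassembled with np.hstack. Synthetic data and block size are arbitrary
+# assumptions; safe to delete.
+def _demo_batching(n_genes: int = 100, n_cells: int = 120, batch: int = 40):
+    rng = np.random.default_rng(4)
+    mvg = rng.random((n_genes, n_cells))
+    gcs = rng.random(n_cells)
+    out = [
+        batch_cytotrace(mvg[:, i : i + batch], gcs[i : i + batch], solver="nnls")
+        for i in range(0, n_cells, batch)
+    ]
+    return np.hstack(out)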
+
+
+def cytotrace_ncore(
+    adata,
+    layer="all",
+    cell_count=10,
+    solver="nnls",
+    ncores=4,
+    batch_cell=3000,
+    shuffle=3,
+):
+    "Parallel (batched) variant of cytotrace() for a single condition"
+    proc_time = time()
+    if layer == "all":
+        X = adata.layers["spliced"] + adata.layers["unspliced"]
+    else:
+        try:
+            X = (
+                adata.layers[layer]
+                if issparse(adata.layers[layer])
+                else adata.layers[layer]
+            )
+        except KeyError:
+            X = adata.X
+
+    cells_selected = np.arange(X.shape[0])
+    genes_selected = np.arange(X.shape[1])
+
+    from sklearn.utils import sparsefuncs_fast
+
+    mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X)
+
+    pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0)
+
+    # cell x gene
+    X = X[:, ~pgenes]
+    # gene x cell, sparse per-cell normalization
+    # X = (X.T / np.array(X.sum(axis=1))[0]) * 1e6
+    X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr()
+
+    pqcells = np.array(
+        (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
+    )[0]
+    X = X[:, ~pqcells]
+
+    genes_selected = genes_selected[~pgenes]
+    cells_selected = cells_selected[~pqcells]
+
+    counts = np.array((X > 0).sum(axis=0))[0]
+    mat2 = census_normalize(X, counts).toarray()
+
+    mvg = remove_zero_mvg(mat2)
+
+    sel = mvg.sum(axis=0) != 0
+    mat2 = mat2[:, sel]
+    counts = counts[sel]
+    cells_selected = cells_selected[sel]
+
+    print(mvg.shape, mat2.shape)
+
+    trans_time = time()
+    print("processing: %s" % (trans_time - proc_time))
+
+    gcs = compute_gcs(mat2, counts)
+
+    # get transition matrices batch-wise, in parallel
+    import multiprocessing as mp
+
+    pool = mp.Pool(ncores)
+
+    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
+    mvg_gcs_batches = []
+    for i in range(0, adata.shape[0], batch_cell):
+        mvg_batch = mvg[:, i : (i + batch_cell)].copy()
+        gcs_batch = gcs[i : i + batch_cell].copy()
+        mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])
+
+    markov_diffusion_time = time()
+
+    result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
+    pool.close()
+    pool.join()
+    score_time = time()
+    print(score_time - markov_diffusion_time)
+    gcs_batches = result.get()
+    gcs_batches = np.hstack(gcs_batches)
+
+    scores = rankdata(gcs_batches) / gcs_batches.shape[0]
+    corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]
+
+    gene_time = time()
+    print("genes: %s" % (gene_time - score_time))
+    adata.obs["gcs"] = None
+    adata.obs.iloc[
+        cells_selected, adata.obs.columns.get_loc("gcs")
+    ] = gcs_batches  # regressed and diffused signature
+
+    adata.obs["cytotrace"] = None
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores
+
+    adata.obs["counts"] = None
+    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts
+
+    adata.var["cytotrace"] = None
+    adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True
+
+    adata.var["cytotrace_corrs"] = None
+    adata.var.iloc[
+        genes_selected, adata.var.columns.get_loc("cytotrace_corrs")
+    ] = corrs
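+
+
+# Reviewer addition -- hedged note: the np.hstack above relies on
+# Pool.starmap_async returning results in submission order, which Python's
+# multiprocessing map variants guarantee. A tiny self-contained illustration:
+def _demo_starmap_order():
+    import multiprocessing as mp
+
+    with mp.Pool(2) as pool:
+        result = pool.starmap_async(divmod, [(9, 4), (7, 2), (5, 3)])
+        return result.get()  # [(2, 1), (3, 1), (1, 2)] -- input order preserved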
+
+
+def align_diffrate(
+    adatas, labels, field="condition", type="A", outfield="cytotrace", ax=None
+):
+    "Compare differentiation-rate distributions across samples"
+    scores = [
+        adatas[0].obs.loc[:, outfield][adatas[0].obs.loc[:, field] == type],
+        adatas[1].obs.loc[:, outfield][adatas[1].obs.loc[:, field] == type],
+    ]
+    if outfield in ["latent_time", "velocity_pseudotime"]:
+        pvalg = mannwhitneyu(scores[0], scores[1], alternative="greater")[1]
+        pvall = mannwhitneyu(scores[0], scores[1], alternative="less")[1]
+    else:
+        pvalg = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="greater")[1]
+        pvall = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="less")[1]
+
+    for s, l in zip(scores, labels):
+        if outfield in ["latent_time", "velocity_pseudotime"]:
+            s = np.sort(s)
+        else:
+            if outfield != "cytotrace":
+                s = np.sort(-s)
+            else:
+                s = np.sort(1 - s)
+        # empirical CDF as a step plot
+        ax.step(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, label=l)
+        ax.scatter(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, s=5)
+    if outfield == "cytotrace":
+        ax.set_xlabel("Differentiation level")
+    else:
+        ax.set_xlabel(outfield)
+    ax.set_ylabel("Cumulative proportion of cells")
+    ax.set_title(type)
+    ax.text(np.percentile(s, 0.6), 0.4, f"{labels[0]} > {labels[1]}: {pvalg:.3E}")
+    ax.text(np.percentile(s, 0.6), 0.48, f"{labels[0]} < {labels[1]}: {pvall:.3E}")
+    ax.legend()
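+
+
+# Reviewer addition -- hedged usage sketch: align_diffrate() draws onto a
+# caller-supplied matplotlib Axes. The two AnnData objects and the "age"
+# field/value below are illustrative assumptions only.
+#
+#     fig, ax = plt.subplots()
+#     align_diffrate(
+#         [adata_sample1, adata_sample2], labels=["sample1", "sample2"],
+#         field="age", type="A", outfield="cytotrace", ax=ax,
+#     )
+#     fig.savefig("diffrate_A.pdf")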
+
+
+def plot_multiadata(adatas):
+    for a in adatas:
+        print(a.obs.age[0])
+        fig, ax = plt.subplots(1, 5)
+        fig.set_size_inches(32, 9)
+        scv.pl.velocity_embedding_grid(
+            a,
+            scale=0.2,
+            color="louvain",
+            show=False,
+            basis="pca",
+            legend_loc="on data",
+            ax=ax[0],
+            title=a.obs.age[0],
+        )
+        scv.pl.scatter(
+            a,
+            color="velocity_pseudotime",
+            basis="pca",
+            size=80,
+            color_map="gnuplot",
+            ax=ax[1],
+            show=False,
+            title=a.obs.age[0],
+        )
+        scv.pl.scatter(
+            a,
+            color="latent_time",
+            basis="pca",
+            size=80,
+            color_map="gnuplot",
+            ax=ax[2],
+            show=False,
+            title=a.obs.age[0],
+        )
+        scv.pl.scatter(
+            a,
+            color="end_points",
+            basis="pca",
+            size=80,
+            color_map="gnuplot",
+            ax=ax[3],
+            show=False,
+            title=a.obs.age[0],
+        )
+        scv.pl.scatter(
+            a,
+            color="root_cells",
+            basis="pca",
+            size=80,
+            color_map="gnuplot",
+            ax=ax[4],
+            show=False,
+            title=a.obs.age[0],
+        )
+
+
+# https://stackoverflow.com/questions/15408371/cumulative-distribution-plots-python
+def cumulative_boxplot(adata_copy):
+    "Cumulative distribution of n_counts for cells with age == 'A'"
+    data = adata_copy[adata_copy.obs.age == "A", :].obs.n_counts
+    # evaluate the histogram
+    values, base = np.histogram(data, bins=10)
+    # evaluate the cumulative counts
+    cumulative = np.cumsum(values)
+    # plot the cumulative distribution
+    plt.scatter(base[:-1], cumulative / cumulative[-1], c="blue")
+    # plot the survival function
+    # plt.plot(base[:-1], len(data) - cumulative, c="green")
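+
+
+if __name__ == "__main__":
+    # Reviewer addition -- a minimal smoke test of the hedged sanity sketches
+    # above; it uses only synthetic data and needs no AnnData input.
+    print("compute_similarity vs np.corrcoef:", _check_compute_similarity())
+    print("FNNLSa vs scipy nnls:", _check_fnnls())
+    print("diffused vs closed form:", _check_diffused())
+    print("sparse variance identity:", _check_sparse_variance())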