-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBatch.py
109 lines (80 loc) · 3.9 KB
/
Batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import numpy as np
import matplotlib.pyplot as plt
class Batch:
def __init__(self, x_mu: np.ndarray, x_sd: np.ndarray, y_mu: np.ndarray, y_sd: np.ndarray, weights: np.ndarray, colors: np.ndarray, proj: np.ndarray, batch_id: str, n_cells: int=1000, n_genes: int=100, gene_specific: bool=False) -> None:
"""
Parameters:
x_mu: x-coordinates of the components' mean.
x_sd: standard deviation of the components.
y_mu: y-coordinates of the components' mean.
y_sd: standard deviation of the components.
weights: weights of components.
colors: colors of components.
proj: random projection matrix; sample from standard normal distribution to project data to n_genes dimensional space.
batch_id: id of the batch.
n_cells: number of cells (i.e, observations or points)
n_genes: number of genes (i.e, features)
gene_specific: whether to add gene specific noise (batch effect) or not
"""
self.x_mu = x_mu
self.x_sd = x_sd
self.y_mu = y_mu
self.y_sd = y_sd
self.weights = weights
self.colors = colors
self.proj = proj
self.batch_id = batch_id
self.n_cells = n_cells
self.n_genes = n_genes
self.gene_specific = gene_specific
def generate_data(self):
"""
Generate n_cells x 2 dimensional data
"""
n_components = self.weights.size # number of codmponents (i.e, cell types)
component_id = np.arange(0, n_components)
self.component_id = component_id
# Generate a random sample with replacement
components = np.random.choice(component_id, size=self.n_cells, replace=True, p=self.weights) # shows which point is sampled from which component
self.components = components
"""
print("# 0s:", components[components == 0].size)
print("# 1s:", components[components == 1].size)
print("# 2s:", components[components == 2].size)
"""
x_means = [self.x_mu[idx] for idx in components]
y_means = [self.y_mu[idx] for idx in components]
x_sd = [self.x_sd[idx] for idx in components]
y_sd = [self.y_sd[idx] for idx in components]
# Sampling locations for cells in each component
samples = np.column_stack([
np.random.normal(loc=x_means, scale=x_sd, size=self.n_cells), # generate x-coordinates of cells using Gaussian distr
np.random.normal(loc=y_means, scale=y_sd, size=self.n_cells) # generate y-coordinates of cells using Gaussian distr
])
self.samples = samples
def save_clusters(self):
"""
Save plot of n_cells x 2 data
"""
plt.figure() # Create a new figure for each plot to isolate legends
# Create a scatter plot for each cluster
for cluster_label in self.component_id:
cluster_indices = (self.components == cluster_label)
plt.scatter(self.samples[cluster_indices, 0], self.samples[cluster_indices, 1], c=self.colors[cluster_label], label=f'p={self.weights[cluster_label]}', marker='o', s=5)
plt.title(self.batch_id)
plt.legend()
plt.savefig("{}.png".format(self.batch_id))
def project_data(self):
"""
Project data from n_cells x 2 to n_cells x n_genes dimension
"""
# (1000, 2) @ (100, 2).T = (1000, 100)
A = np.dot(self.samples, self.proj.T)
# Add standard normal noise
noise = np.random.normal(size=(self.n_cells, self.n_genes))
A += noise
if self.gene_specific:
# Add some noise (batch effect) to each gene. Noise of particular gene across cells should be the same
gene_noise = np.random.normal(size=self.n_genes)
A += gene_noise # broadcasting
return A