-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathgenome.py
240 lines (191 loc) · 6.47 KB
/
genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# genome.py : synthesizing DNA sequences
# data: https://www.kaggle.com/code/tarunsolanki/classifying-dna-sequence-using-ml
import pandas as pd
import numpy as np
import re # for regular expressions
#--- [1] Read data
url = "https://raw.githubusercontent.com/VincentGranville/Main/main/dna_human.txt"
human = pd.read_table(url)
# human = pd.read_table('dna_human.txt')
print(human.head())
#--- [2] Build hash table architecture
#
# hash1_list[string1] is the list of potential string2 found after string1, separated by ~
nobs = len(human)
print(nobs)
hash12 = {}
hash1_list = {}
hash1 = {}
hash2 = {}
count1 = 0
count2 = 0
count12 = 0
sequence = ''
for k in range(nobs):
obs = human['sequence'][k]
sequence += obs
sequence += 'N'
type = human['class'][k]
length = len(obs)
string1_length = 4
string2_length = 2
pos0 = 0
pos1 = pos0 + string1_length
pos2 = pos1 + string2_length
while pos2 < length:
string1 = obs[pos0:pos1]
string2 = obs[pos1:pos2]
if string1 in hash1:
if string2 not in hash1_list[string1] and 'N' not in string2:
hash1_list[string1] = hash1_list[string1] + '~' + string2
hash1[string1] += 1
count1 += 1
elif 'N' not in string1:
hash1_list[string1] = '~' + string2
hash1[string1] = 1
key = (string1, string2)
if string2 in hash2:
hash2[string2] += 1
count2 += 1
elif 'N' not in string2:
hash2[string2] = 1
if key in hash12:
hash12[key] += 1
count12 += 1
elif 'N' not in string1 and 'N' not in string2:
hash12[key] = 1
pos0 += 1
pos1 += 1
pos2 += 1
if k % 100 == 0:
print("Creating hash tables: %6d %6d %4d" %(k, length, type))
#--- [3] Create table of string associations, compute PMI metric
print()
index = 0
for key in hash12:
index +=1
string1 = key[0]
string2 = key[1]
n1 = hash1[string1] # occurrences of string1
n2 = hash2[string2] # occurrences of string2
n12 = hash12[key] # occurrences of (string1, string2)
p1 = n1 / count1 # frequency of string1
p2 = n2 / count2 # frequency of string2
p12 = n12 / count12 # frequency of (string1, string2)
pmi = p12 / (p1 * p2)
if index % 100 == 0:
print("Computing string frequencies: %5d %4s %2s %4d %8.5f"
%(index, string1, string2, n12, pmi))
print()
#--- [4] Synthetization
#
# synthesizing word2, one at a time, sequencially based on previous word1
n_synthetic_string2 = 2000000
seed = 65
np.random.seed(seed)
synthetic_sequence = 'TTGT' # starting point (must be existing string1)
pos1 = len(synthetic_sequence)
pos0 = pos1 - string1_length
for k in range(n_synthetic_string2):
string1 = synthetic_sequence[pos0:pos1]
string = hash1_list[string1]
myList = re.split('~', string)
# get target string2 list in arr_string2, and corresponding probabilities in arr_proba
arr_string2 = []
arr_proba = []
cnt = 0
for j in range(len(myList)):
string2 = myList[j]
if string2 in hash2:
key = (string1, string2)
cnt += hash12[key]
arr_string2.append(string2)
arr_proba.append(hash12[key])
arr_proba = np.array(arr_proba)/cnt
# build cdf and sample word2 from cdf, based on string1
u = np.random.uniform(0, 1)
cdf = arr_proba[0]
j = 0
while u > cdf:
j += 1
cdf += arr_proba[j]
synthetic_string2 = arr_string2[j]
synthetic_sequence += synthetic_string2
if k % 100000 == 0:
print("Synthesizing %7d/%7d: %4d %8.5f %2s"
% (k, n_synthetic_string2, j, u, synthetic_string2))
pos0 += string2_length
pos1 += string2_length
print()
print("Real DNA:\n", sequence[0:1000])
print()
print("Synthetic DNA:\n", synthetic_sequence[0:1000])
print()
#--- [5] Create random sequence for comparison purposes
print("Creating random sequence...")
length = (1 + n_synthetic_string2) * string2_length
random_sequence = ""
map = ['A', 'C', 'T', 'G']
for k in range(length):
random_sequence += map[np.random.randint(4)]
if k % 100000 == 0:
print("Creating random sequence: %7d/%7d" %(k,length))
print()
print("Random DNA:\n", random_sequence[0:1000])
print()
#--- [6] Evaluate quality: real vs synthetic vs random DNA
max_nodes = 10000 # sample strings for frequency comparison
string_length = 6 # length of sample strings (fixed length here)
nodes = 0
hnodes = {}
iter = 0
while nodes < max_nodes and iter < 5*max_nodes:
index = np.random.randint(0, len(sequence)-string_length)
string = sequence[index:index+string_length]
iter += 1
if string not in hnodes and 'N' not in string:
hnodes[string] = True
nodes += 1
if nodes % 1000 == 0:
print("Building nodes: %6d/%6d" %(nodes, max_nodes))
print()
def compute_HD(hnodes, sequence, synthetic_sequence):
pdf1 = []
pdf2 = []
cc = 0
for string in hnodes:
cnt1 = sequence.count(string)
cnt2 = synthetic_sequence.count(string)
pdf1.append(float(cnt1))
pdf2.append(float(cnt2))
ratio = cnt2 / cnt1
if cc % 100 == 0:
print("Evaluation: computing EPDFs: %6d/%6d: %5s %8d %8d %10.7f"
%(cc, nodes, string, cnt1, cnt2, ratio))
cc += 1
pdf1 = np.array(pdf1) # original dna sequence
pdf2 = np.array(pdf2) # synthetic dna sequence
pdf1 /= np.sum(pdf1)
pdf2 /= np.sum(pdf2)
HD = 0.0
for k in range(len(pdf1)):
HD += np.sqrt(pdf1[k]*pdf2[k])
HD = np.sqrt(1 - HD)
return(pdf1, pdf2, HD)
pdf_dna, pdf_synth, HD_synth = compute_HD(hnodes, sequence, synthetic_sequence)
pdf_dna, pdf_random, HD_random = compute_HD(hnodes, sequence, random_sequence)
print()
print("Total nodes: %6d" %(nodes))
print("Hellinger distance [synthetic]: HD = %8.5f" %(HD_synth))
print("Hellinger distance [random] : HD = %8.5f" %(HD_random))
#--- [7] Visualization (PDF scatterplot)
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['axes.linewidth'] = 0.5
plt.rcParams['xtick.labelsize'] = 7
plt.rcParams['ytick.labelsize'] = 7
plt.scatter(pdf_dna, pdf_synth, s = 0.5, color = 'red', alpha = 0.5)
plt.scatter(pdf_dna, pdf_random, s = 0.5, color = 'blue', alpha = 0.5)
plt.legend(['real vs synthetic', 'real vs random'], loc='upper left', prop={'size': 7}, )
plt.plot([0, np.max(pdf_dna)], [0, np.max(pdf_dna)], c='black', linewidth = 0.3)
plt.show()