-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_bcf.py
91 lines (76 loc) · 2.88 KB
/
parse_bcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from pybcf import BcfReader
from collections import defaultdict
import numpy as np
import moments
import pickle
def get_population_samples(samples_path):
samples = defaultdict(list)
with open(samples_path, "r") as fin:
for line in fin:
if line.startswith("CAAPA_ID"):
continue
else:
ID, POP, _, _ = line.split("\t")
samples[POP].append(ID)
return samples
def count_derived_alleles(genotypes, sample_idx, pop):
alleles, num = np.unique(genotypes[sample_idx[pop]], return_counts=True)
num = num[np.isnan(alleles) == False]
alleles = alleles[np.isnan(alleles) == False]
if len(alleles) > 1:
n = int(np.sum(num))
i = int(np.sum(num[alleles != 0]))
return (n, i)
else:
return None
def get_single_populations_sfs(bcf_path, samples):
bcf = BcfReader("CAAPA_Freeze2_NAM_MSL_CBU_TWA_BAK_chr22.bcf")
# get population sample indexes
sample_idx = {
pop: [bcf.samples.index(s) for s in ids] for pop, ids in samples.items()
}
# tally allele counts, storing (n, i) as keys
# here, n is the sample size (accounting for missing data), and i is the
# number of derived alleles
counts = {pop: defaultdict(int) for pop in samples.keys()}
for var in bcf:
genotypes = var.samples["GT"]
for pop in samples.keys():
k = count_derived_alleles(genotypes, sample_idx, pop)
if k is not None:
counts[pop][k] += 1
return counts
def build_spectra(counts):
spectra = {p: {} for p in samples.keys()}
for p in samples.keys():
for k, v in counts[p].items():
n, i = k
if n not in spectra[p]:
spectra[p][n] = np.zeros(n + 1)
spectra[p][n][i] = v
return spectra
def project_spectra(spectra, pop, nmin):
fs = moments.Spectrum(np.zeros(nmin + 1))
for n, fs_from in spectra[pop].items():
if n >= nmin:
fs += moments.Spectrum(fs_from).project([nmin])
return fs
if __name__ == "__main__":
sample_path = "samples_metadata.txt"
samples = get_population_samples(sample_path)
# count alleles in the VCF
bcf_path = "CAAPA_Freeze2_NAM_MSL_CBU_TWA_BAK_chr22.bcf"
counts = get_single_populations_sfs(bcf_path, samples)
# compile the spectra from the parsed counts
spectra = build_spectra(counts)
with open("all_1d_spectra.pkl", "wb+") as fout:
pickle.dump(spectra, fout)
# sample sizes are selected so that pi(n) / pi(max) is > 99.5%
nmax = {"Baka": 56, "Mende": 156, "Nama": 70, "BaTwa": 94, "Chabu": 42}
spectra_proj = {
pop: project_spectra(spectra, pop, nmax[pop]) for pop in nmax.keys()
}
for pop in spectra_proj.keys():
spectra_proj[pop].pop_ids = [pop]
with open("projected_1d_spectra.pkl", "wb+") as fout:
pickle.dump(spectra_proj, fout)