-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaggregate_mutations.py
executable file
·92 lines (80 loc) · 2.75 KB
/
aggregate_mutations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Implements ``snakemake`` rule to aggregate all mutations across primer sets."""
import re
import Bio.Data.IUPACData
import pandas as pd
# Bind the file paths handed over by snakemake to module-level names.
# ``snakemake`` itself is not defined in this file; it is expected to be
# present as a global when the script is run by the workflow.
_rule_inputs = snakemake.input
_rule_outputs = snakemake.output

# input primer tables, one per mutation category
paired_positiveSel_primers = _rule_inputs.paired_positiveSel_primers
usher_primers = _rule_inputs.usher_primers
positiveSel_primers = _rule_inputs.positiveSel_primers
gisaid_primers = _rule_inputs.gisaid_primers

# destination of the aggregated mutation table
output_csv = _rule_outputs.csv

# string of one-letter protein codes taken from Biopython's IUPAC data
amino_acids = Bio.Data.IUPACData.protein_letters
# Read the paired positive-selection primers and build a table with one row
# per (site, amino acid) combination.
# Each primer name carries two site numbers (each optionally written with a
# trailing ``.0``) followed by one or two ``_NNC`` / ``_NNG`` suffixes; both
# sites of every pair are collected.
paired_positive = (
    pd.read_csv(paired_positiveSel_primers, header=None, names=['name', 'primer'])
    ['name'].str
    .extract(
        # raw strings: ``\-``, ``\d`` and ``\.`` are regex escapes, not
        # (invalid) Python string escapes
        r'^PS_paired\-(?:for|rev)\-mut_epi\-(?P<site1>\d+)'
        r'(?:\.0)?\-(?P<site2>\d+)(?:\.0)?(?:_NN[CG]){1,2}$'
    )
    # stack the two site columns into a single ``site`` column
    .melt(value_vars=['site1', 'site2'], value_name='site')
    [['site']]
    .assign(
        mutation_type='paired positive selection',
        site=lambda x: x['site'].astype(int),
        # every row first gets the whole ``amino_acids`` string ...
        amino_acid=amino_acids,
    )
    # forward and reverse primers (and shared sites) give repeated rows
    .drop_duplicates()
    # ... which is then split into characters and expanded to one row each
    .assign(amino_acid=lambda x: x['amino_acid'].map(list))
    .explode('amino_acid')
    [['site', 'amino_acid', 'mutation_type']]
)
assert paired_positive.notnull().all().all(), (
    'missing values in paired positive selection mutations'
)
# Read the positive-selection primers and build a table with one row per
# (site, amino acid) combination.
# The site number is parsed from the ``Primer name`` column of the CSV.
positive = (
    pd.read_csv(positiveSel_primers)
    ['Primer name'].str
    # raw string: ``\-`` and ``\d`` are regex escapes, not (invalid) Python
    # string escapes
    .extract(r'^positiveSelectionSpike_NN[CG]\-(?:for|rev)\-mut(?P<site>\d+)$')
    .assign(
        site=lambda x: x['site'].astype(int),
        mutation_type='positive selection',
        # every row first gets the whole ``amino_acids`` string ...
        amino_acid=amino_acids,
    )
    # several primers can name the same site; keep each site once
    .drop_duplicates()
    # ... which is then split into characters and expanded to one row each
    .assign(amino_acid=lambda x: x['amino_acid'].map(list))
    .explode('amino_acid')
    [['site', 'amino_acid', 'mutation_type']]
)
assert positive.notnull().all().all(), (
    'missing values in positive selection mutations'
)
# Read the UShER recurrent-mutation primers.
# Each primer name encodes a site number followed by a single upper-case
# letter, which is kept as the amino acid for that site.
recurrent = (
    pd.read_csv(usher_primers, header=None, names=['name', 'primer'])
    ['name'].str
    # raw string: ``\-`` and ``\d`` are regex escapes, not (invalid) Python
    # string escapes
    .extract(r'^variant_usher\-(?:for|rev)\-mut(?P<site>\d+)(?P<amino_acid>[A-Z])$')
    .assign(
        mutation_type='recurrent mutation',
        site=lambda x: x['site'].astype(int),
    )
    # the same mutation can appear under several primer names; keep it once
    .drop_duplicates()
    [['site', 'amino_acid', 'mutation_type']]
)
assert recurrent.notnull().all().all(), (
    'missing values in recurrent mutations'
)
# Read the GISAID observed-mutation primers.
# Each value in the ``primer_name`` column encodes a site number followed by
# a single upper-case letter or ``-`` (presumably a deletion -- confirm
# against the primer design), which is kept as the amino acid for that site.
observed = (
    pd.read_csv(gisaid_primers)
    ['primer_name'].str
    # raw string: ``\-`` and ``\d`` are regex escapes, not (invalid) Python
    # string escapes
    .extract(r'^variantGISAID\-(?:for|rev)\-mut(?P<site>\d+)(?P<amino_acid>[A-Z\-])$')
    .assign(
        mutation_type='observed mutation',
        site=lambda x: x['site'].astype(int),
    )
    # the same mutation can appear under several primer names; keep it once
    .drop_duplicates()
    [['site', 'amino_acid', 'mutation_type']]
)
assert observed.notnull().all().all(), (
    'missing values in observed mutations'
)
# Stack the four mutation tables, order the rows, and save them as CSV.
_mutation_tables = [paired_positive, positive, recurrent, observed]
_all_mutations = pd.concat(_mutation_tables)
_all_mutations = _all_mutations.sort_values(['mutation_type', 'site', 'amino_acid'])
# the output file exposes the site number under the name ``sequential_site``
_all_mutations = _all_mutations.rename(columns={"site": "sequential_site"})
_all_mutations.to_csv(output_csv, index=False)