forked from dsi-bdi/biokg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_benchmarks.py
162 lines (136 loc) · 6.62 KB
/
build_benchmarks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from os import makedirs
from os.path import join, isdir
import gzip
from biodblinker import GeneNameLinker
from shutil import copy
def export_triplets(triplets, filepath):
""" Export triplets to file
Parameters
----------
triplets : iter
Triplets
filepath : str
File path
"""
fd = open(filepath, "w")
for s, p, o in triplets:
fd.write(f"{s}\t{p}\t{o}\n")
fd.close()
def build_benchmarks(preprocessed_dp, output_dp):
""" Build BioKG benchmarks
Parameters
----------
preprocessed_dp : str
Preprocessed directory path
output_dp : str
Output directory path
"""
benchmarks_dp = join(output_dp, "benchmarks")
makedirs(benchmarks_dp) if not isdir(benchmarks_dp) else None
# ================================================================================================
# Build DDI benchmarks
# ------------------------------------------------------------------------------------------------
# Benchmark #1: Drug-drug interactions and their effects on body minerals
# Benchmark #2: Drug-drug interactions and their effects on therapeutic efficacy
# ------------------------------------------------------------------------------------------------
ddi_desc_fp = join(preprocessed_dp, "drugbank", "db_ddi.txt")
mineral_triplets = set()
efficacy_triplets = set()
mineral_effects = {"calcemia", "glycemia", "kalemia", "atremia"}
efficacy_effects = {"efficacy"}
for line in open(ddi_desc_fp):
d1, _, d2, ddi_effect = line.strip().split("\t")
is_mineral_effect = bool(sum([mef in ddi_effect for mef in mineral_effects]))
is_efficacy_effect = bool(sum([ef in ddi_effect for ef in efficacy_effects]))
if is_mineral_effect:
mineral_triplets.add((d1, ddi_effect, d2)) if d1 > d2 else mineral_triplets.add((d2, ddi_effect, d1))
if is_efficacy_effect:
efficacy_triplets.add((d1, ddi_effect, d2)) if d1 > d2 else efficacy_triplets.add((d2, ddi_effect, d1))
ddi_chem_fp = join(benchmarks_dp, "ddi_minerals.tsv")
ddi_efficacy_fp = join(benchmarks_dp, "ddi_efficacy.tsv")
export_triplets(mineral_triplets, ddi_chem_fp)
export_triplets(efficacy_triplets, ddi_efficacy_fp)
# ================================================================================================
# Build DPI benchmarks
# ------------------------------------------------------------------------------------------------
# Benchmark #1: Drug-protein interactions of FDA approved drugs
# Benchmark #2: Drug effects of protein expression for FDA approved drugs
# ------------------------------------------------------------------------------------------------
drug_stage_fp = join(preprocessed_dp, "drugbank", "db_product_stage.txt")
drug_targets_fp = join(output_dp, "links", "dpi.txt.gz")
drug_targets_effects_fp = join(preprocessed_dp, "ctd", "ctd_drug_protein_interactions.txt")
fda_approved_drugs = set()
fda_dpi = set()
for line in open(drug_stage_fp):
dr, _, stage = line.strip().split("\t")
if stage == "approved":
fda_approved_drugs.add(dr)
for line in gzip.open(drug_targets_fp, 'rt'):
dr, _, pr = line.strip().split("\t")
if dr in fda_approved_drugs:
fda_dpi.add((dr, "DPI", pr))
dpi_fda_fp = join(benchmarks_dp, "dpi_fda.tsv")
export_triplets(fda_dpi, dpi_fda_fp)
dpi_inc_exp = set()
dpi_dec_exp = set()
for line in open(drug_targets_effects_fp):
dr, effect_txt, pr, _, _ = line.strip().split("\t")
if effect_txt == "INCREASES_EXPRESSION":
dpi_inc_exp.add((dr, pr))
if effect_txt == "DECREASES_EXPRESSION":
dpi_dec_exp.add((dr, pr))
dpi_exp_fp = join(benchmarks_dp, "dep_fda_exp.tsv")
conflict_dpi = set.intersection(dpi_inc_exp, dpi_dec_exp)
dpi_inc_exp_triplets = [[dr, "inc_expr", pr] for dr, pr in dpi_inc_exp if (dr, pr) not in conflict_dpi]
dpi_dec_exp_triplets = [[dr, "dec_expr", pr] for dr, pr in dpi_dec_exp if (dr, pr) not in conflict_dpi]
dpi_exp_triplets = dpi_inc_exp_triplets + dpi_dec_exp_triplets
dep_exp_fda_triplets = [[dr, eff, pr] for dr, eff, pr in dpi_exp_triplets if dr in fda_approved_drugs]
export_triplets(dep_exp_fda_triplets, dpi_exp_fp)
# ================================================================================================
# Build Phosphorylation benchmark
# ------------------------------------------------------------------------------------------------
# Benchmark #1: Human Kinase Substrate phosphorylation interactions from phosphosite plus and Hijazi 20
# ------------------------------------------------------------------------------------------------
linker = GeneNameLinker()
human_uniprot_ids = set()
psp_set = set()
hijazi20_set = set()
psp_phos_fp = join(preprocessed_dp, 'phosphosite', 'kinase_substrate.txt')
hijazi20_fp = join(preprocessed_dp, 'hijazi20', 'phosphorylation.txt')
phos_fp = join(benchmarks_dp, 'phosphorylation.tsv')
with open(join(preprocessed_dp, 'uniprot', 'uniprot_metadata.txt'), 'r') as fd:
for line in fd:
s, p, o = line.strip().split('\t')
if p == 'SPECIES' and o == 'HUMAN':
human_uniprot_ids.add(s)
with open(psp_phos_fp, 'r') as fd:
for line in fd:
kin, _, sub, site = line.strip().split('\t')[:4]
if kin in human_uniprot_ids and sub in human_uniprot_ids:
psp_set.add((kin, sub, site))
with open(hijazi20_fp, 'r') as fd:
for line in fd:
parts = line.strip().split('\t')
if len(parts) < 4:
print(line)
continue
kin_gene, _, sub_gene, site = parts[:4]
kin_uniprot_ids, sub_uniprot_ids = linker.convert_gene_name_to_uniprot([kin_gene, sub_gene])
for kin in kin_uniprot_ids:
if kin in human_uniprot_ids:
for sub in sub_uniprot_ids:
if sub in human_uniprot_ids:
hijazi20_set.add((kin, sub, site))
phosphorylation_set = psp_set.union(hijazi20_set)
phos_quads = [[kin, 'phosphorylates', sub, site] for kin, sub, site in phosphorylation_set]
fd = open(phos_fp, "w")
for s, p, o, c in phos_quads:
fd.write(f"{s}\t{p}\t{o}\t{c}\n")
fd.close()
def main():
preprocessed_dp = "./data/preprocessed"
output_dp = "./data/output"
build_benchmarks(preprocessed_dp, output_dp)
copy('benchmarks_readme.txt', join(output_dp, 'benchmarks', 'README.txt'))
if __name__ == '__main__':
main()