-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_domain_importance.py
138 lines (119 loc) · 5.8 KB
/
get_domain_importance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/env python
#######################################################################
# Copyright (C) 2022 Julian Dosch
#
# This file is part of Spice.
#
# get_domain_importance is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# get_domain_importance is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Spice. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
import argparse
import json
import os
from pathlib import Path
def option_parse():
parser = argparse.ArgumentParser(epilog="This script uses the annotation file of the different protein isoforms of each gene in a proteome to generate a dictionary of features that have presence/absence changes due to AS.")
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument("-i", "--inPath", default='.', type=str, required=True,
help="path to input json")
required.add_argument("-o", "--outPath", default='.', type=str, required=True,
help="path to output directory. Name will be based on input file.")
optional.add_argument("-l", "--isoformList", default=None, type=str, required=False,
help="path to result file containing the relevant (expressed) isoforms when comparing two conditions.")
args = parser.parse_args()
main(args.inPath, args.outPath, args.isoformList)
def main(inpath, outpath, isoformlist):
rel_isoforms = None
if isoformlist:
rel_isoforms = read_isoformlist(isoformlist)
genes = read_input(inpath, rel_isoforms)
output = get_fdict(genes)
save2json(output, outpath)
def read_isoformlist(path):
isoformdict = {}
with open(path, 'r') as infile:
for line in infile.readlines():
cells = line.rstrip('\n').split('\t')
if not (cells[0] == 'gene_id' or line[0] == '!'):
isoformdict[cells[0]] = cells[1].split(';')
return isoformdict
def save2json(dict2save, directory):
Path(directory).mkdir(parents=True, exist_ok=True)
jsonOut = json.dumps(dict2save, ensure_ascii=False)
out = open(directory + '/important_features.json', 'w')
out.write(jsonOut)
out.close()
def get_fdict(genes): # generates dict {feature: [gene1, gene2...]}
f_dict = {}
for gene in genes:
f_list = []
for isoform in genes[gene]['isoforms']:
for feature in genes[gene]['isoforms'][isoform]['types']:
if not feature in f_list:
if genes[gene]['isoforms'][isoform]['types'][feature] < genes[gene]['max'][feature]:
f_list.append(feature)
for clan in genes[gene]['isoforms'][isoform]['clans']:
if not clan in f_list:
if genes[gene]['isoforms'][isoform]['clans'][clan] < genes[gene]['max'][clan]:
f_list.append(clan)
for feature in f_list:
if not feature in f_dict:
f_dict[feature] = []
f_dict[feature].append(gene)
return f_dict
def read_input(inpath, isoformlist): # reads input json that contains the isoform annotations and restructures the data in a gene centric fashion
with open(inpath, 'r') as infile:
in_dict = json.loads(infile.read())
features, clans = in_dict['feature'], in_dict['clan']
genes = {}
for protein in features:
gid, pid, tid = protein.split('|') # isoform id structure: geneid|proteinid|taxid
gtmp = True
if isoformlist:
if not gid in isoformlist:
gtmp = False
if gtmp:
if not gid in genes:
genes[gid] = {'max': {}, 'isoforms': {}}
isotmp = True
if isoformlist:
if not pid in isoformlist[gid]:
isotmp = False
if isotmp:
genes[gid]['isoforms'][pid] = {'types': {}, 'clans': {}}
for tool in features[protein]:
if not tool=="length":
for ftype in features[protein][tool]:
count = len(features[protein][tool][ftype]["instance"])
if not ftype in genes[gid]['max']:
genes[gid]['max'][ftype] = count
elif count > genes[gid]['max'][ftype]:
genes[gid]['max'][ftype] = count
genes[gid]['isoforms'][pid]['types'][ftype] = count
if ftype in clans:
clan = clans[ftype]
if not clan in genes[gid]['isoforms'][pid]['clans']:
genes[gid]['isoforms'][pid]['clans'][clan] = count
else:
genes[gid]['isoforms'][pid]['clans'][clan] += count
for clan in genes[gid]['isoforms'][pid]['clans']:
count = genes[gid]['isoforms'][pid]['clans'][clan]
if not clan in genes[gid]['max']:
genes[gid]['max'][clan] = count
elif count > genes[gid]['max'][clan]:
genes[gid]['max'][clan] = count
return genes
if __name__ == '__main__':
option_parse()