-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathneigh.py
230 lines (157 loc) · 6.96 KB
/
neigh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
## Neigh packag v2 ##
# Joaquin Giner Lamia
from __future__ import division
from pymongo import MongoClient
from collections import Counter
import os,sys
import json
## self packages
import mongo_client
from neigh_functions import *
### connection to mongo client using mongo_client.py ###
coll_unigenes = mongo_client.mongo_connect()[0]
coll_clusters = mongo_client.mongo_connect()[1]
coll_e5 = mongo_client.mongo_connect()[2]
###### Parametters for the analysis #####
maximum_gmgc_genes = int(400) # maximun number of unigene allowed by gmgc cluster to be computed, smaller the number smaller computing time
percentage_cut_off = int(30) # percentage of cogs (porcentaje de keggs necesario para considerar un kegg como posible candidato entre todos los neighbours keggs)
neighbours_range = 2 # number of neighbours genes to analyze around each unigene
#### input file name
global input_unigene_file
input_unigene_file =""
## data input
gmgc_list = []
try:
sys.argv[1] == True
except:
sys.exit("\nError: please use one of these options\nUsage: neigh.py -u <unigene cluster> or \n\tneigh.py -f <file(unigenes_list).txt>")
if sys.argv[1] == "-f":
try:
argument = "-f"
input_unigene_file= sys.argv[2]
input_file = open(input_unigene_file,"r")
for line in input_file:
gmgc_list.append(line.strip())
except:
sys.exit("\nError: please add a unigene\nUsage: neigh.py -u <unigene> or \nneigh.py -f <file(unigenes_list).txt>")
elif sys.argv[1] == "-u":
try:
argument = "-u"
input_gmgc= sys.argv[2]
gmgc_list.append(input_gmgc)
except:
sys.exit("\nError: please add a unigene\nUsage: neigh.py -u <unigene> or \nneigh.py -f <file(unigenes_list).txt>")
##### NEIGH FUNCTIONAL ANALYSIS ####
#note gmgc means unigene cluster
def neigh_analysis(gmgc_list,neighbours_range):
## RESULT LIST FOR OUTPUT WRITE
result_kegg_for_output_file=[]
result_egg_for_output_file=[]
result_unigenes_for_output_file=[]
for gmgc in gmgc_list:
if "GMGC" in gmgc:
gmgc = clean_unigene(gmgc)
## RESULT LIST
result_list_kegg = []
result_list_egg = []
list_of_neigh_unigenes = []
#Retrieve neighbours genes for every ORFs in the GMGC cluster
#retrieve orfs sample_contig_gene that belong to unigene cluster
gmgc_orfs_cluster = mongo_orf_find(gmgc,maximum_gmgc_genes,coll_unigenes)
#retrieve neighbours genes for every orfs cluster, by default -2-1+1+2
neighbours_genes= retrieve_neighbours_data(gmgc_orfs_cluster,neighbours_range)
count_for_list_of_unigenes = 0 #Creates the unigenes list only one time
for cog in ("kegg","Egg"):
cog_category = cog
count_for_list_of_unigenes +=1
#retrieve cogs assignation list and neighbourhood parametters
cogs_assignation_list = compute_neigh_cogs_assignation(neighbours_genes,neighbours_range,coll_unigenes,coll_clusters,cog_category)
cogs_organization_list = cogs_assignation_list[0]
analysed_orfs = cogs_assignation_list[1]
number_neigh = cogs_assignation_list[2]
number_neigh_with_cogs = cogs_assignation_list[3]
unigenes_functions_list = cogs_assignation_list[4]
# number of genes that have at least one cog in his neighobours
neigh_orf_with_cogs = get_neigh_orf_with_cogs(cogs_organization_list)
# unigene cluster and their neighbours unigenes for future analysis
if count_for_list_of_unigenes < 2:
neigh_unigenes = create_unigenes_lists_for_print(gmgc,neighbours_genes,neighbours_range,coll_unigenes)
list_of_neigh_unigenes.append(neigh_unigenes)
if list_of_neigh_unigenes[0].rstrip() == gmgc:
list_of_neigh_unigenes= [gmgc,"singleton"]
###
gmgc_functional = mongo_functional_find(gmgc,coll_clusters)# retrieve functional information from GMGC query
if cog_category == "kegg":
query_list = gmgc_functional[0]
else:
query_list = gmgc_functional[1]
# compute total number of cogs and unique cogs in the unigene cluster
Count = Counter(unigenes_functions_list)
unique_cogs = 0
count_of_cogs = 0
for k,v in Count.items():
count_of_cogs += int(v)
unique_cogs +=1
# This section compute if any of the cogs are highly representated in the neighbourhood of the unigenes
# In the case there are cogs overepresentated they are storage in kegg_description_dict
kegg_description_dict = {}
subject_cog_list = [] # store subject list of cog
subject_cog_dict = {} #almacenamos todos los kegg que pasen el porcentaje para compararlos con los query_kegg_list
for k,v in Count.items(): # v is the number of time this function has been found in the neighbours genes
cog = k
KEGG_count = str(v)
percentage = float(v/neigh_orf_with_cogs)*100 # compute percentage of number of neighbours genes with cogs
percentage = ("{0:.2f}".format(percentage))
if float(percentage) >= percentage_cut_off:
if cog_category == "kegg": # control for cog_category output
kegg_description_dict[cog] = get_kegg_description(cog)
else:
kegg_description_dict[cog] = get_egg_description(cog,coll_e5)
subject_cog_dict[cog]=percentage
subject_cog_list.append(cog)
# Get funciontal scores and cogs description of enriched cogs
if kegg_description_dict != {}:
# get functional description and COG hits
description_list = get_functional_data(subject_cog_dict, query_list, kegg_description_dict, cog_category)
hits, query_list, description_list, hit_kegg_percentage, num_query_cog = [description_list[i] for i in (0,1,2,3,4)]
# get functional scores
score_list = neigh_scores(hits,num_query_cog,subject_cog_list,unique_cogs,count_of_cogs,neigh_orf_with_cogs)
positive_value, accuracy_value, func_conservation = [score_list[i] for i in (0,1,2)]
### LIST FOR OUTPUT SECTION ###
result =[gmgc,
",".join(query_list),
",".join(subject_cog_list),
str(analysed_orfs),
str(number_neigh),
str(number_neigh_with_cogs),
str(unique_cogs),
str(count_of_cogs),
str(func_conservation),
",".join(hit_kegg_percentage),
";".join(description_list)
]
if cog_category == "kegg":
result_list_kegg.append(result)
else:
result_list_egg.append(result)
else:
result = [gmgc,"No Match"]
if cog_category == "kegg":
result_list_kegg.append(result)
else:
result_list_egg.append(result)
result_kegg_for_output_file.append(result_list_kegg)
result_egg_for_output_file.append(result_list_egg)
result_unigenes_for_output_file.append(list_of_neigh_unigenes)
return [result_kegg_for_output_file,result_egg_for_output_file,result_unigenes_for_output_file]
def main(gmgc_list):
global kegg_dict
kegg_pathways = open("data/KEGGs_pathways.txt","r")
kegg_dict = make_kegg_dict(kegg_pathways)
result_list = neigh_analysis(gmgc_list,neighbours_range)
kegg_result = result_list[0]
egg_result = result_list[1]
list_of_neighbours_unigenes = result_list[2]
print_results(argument, kegg_result, egg_result, list_of_neighbours_unigenes,input_unigene_file)
if __name__ == main(gmgc_list):
main(gmgc_list)