'''
Functions to obtain the protein annotation mapping
(annotations that either can't be obtained from the PDB API or are easier to obtain elsewhere)
source database: UniProt
#TODO: functions to incorporate an existing DeepLoc annotation file are not implemented yet
'''
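# Typical workflow (a sketch based on the Run() modes below; file names follow the defaults in this module):
#   1. Run(mode='queryUniprot', date=...) -> query UniProt, save data/protein_uniprotMap_<date>.txt/.csv,
#      and write data/DeepLoc/deeplocInput_<i>.fasta plus data/DeepLoc/unclass_protein.csv
#   2. submit the FASTA files to the DeepLoc 2.1 web server by hand and download the result CSVs into data/DeepLoc
#   3. Run(mode='addDeepLoc', date=...) -> combine the results_*.csv files with unclass_protein.csv,
#      save data/protein_deepLocMap_<date>.txt/.csv, and map the predictions onto the full dataframe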
import pandas as pd
import numpy as np
import requests
import time
import json
import re
from requests.adapters import HTTPAdapter, Retry
from ast import literal_eval
import uuid
import os
#import traceback
class ProteinAnnotation:
def __init__(self, full_df, protein_df_current_uni=None, protein_df_current_deep=None):
'''
input:
- full_df: dataframe of the biodolphin full dataset (current + queried) that are mapped with lipid annotations
- protein_df_current_uni: existing protein annotation file with uniprot information
- protein_df_current_deep: existing protein annotation file with deeploc information
'''
self.full_df = full_df # dataframe of the full dataset
if full_df is not None:
self.new_df = full_df[full_df["New_query"] == True] # dataframe of the subset that is newly queried
self.protein_df_uni = protein_df_current_uni
self.protein_df_deep = protein_df_current_deep
self.uniprotIDs = None
#self.protein_df = None
def Run(self, mode, date):
'''
Get the UniProt protein annotation file
input:
- mode: queryUniprot -> query uniprot info for the new uniprot IDs, map all uniprot info onto the full dataframe, then produce the deeploc input files
#TODO: addDeepLoc -> add deeploc info for the new uniprot IDs, combine the current and queried protein annotation files and save them
'''
if mode == 'queryUniprot':
# Query uniprot info and map it to the full dataframe
self.GetIDList(self.protein_df_uni, save=True) # set self.uniprotIDs to a list of uniprot IDs that are not in the existing protein annotation uniprot file
self.QueryUniProt() # query these uniprot ID from UniProt to get information, place the updated mapping at self.protein_df_uni, and update the uniprot mapping to self.full_df
self.SaveProtAnno(self.protein_df_uni, prefix=f"protein_uniprotMap_{date}") # save the uniprot mapping information (self.protein_df_uni)
self.MapUni(self.full_df, self.protein_df_uni) #map the uniprot information to the full dataframe and place it at self.full_df
self.GetMemType_DeepLoc() # get the deep loc fasta file
return self.full_df #return the updated full dataframe (with uniprot info mapped)
elif mode == 'addDeepLoc':
# Add DeepLoc results and map it to the full dataframe
self.CombineDeepLoc() # combine the deeploc results with the unclass_protein.csv to produce protein deeploc annotation file (place at self.protein_df_deep)
self.SaveProtAnno(self.protein_df_deep, prefix=f"protein_deepLocMap_{date}")
self.MapDeep(self.full_df, self.protein_df_deep)
#self.Update() #update the protein annotation file to include the new UniProt
#self.SaveProtAnno(prefix="protein_annotations")
return self.full_df #return the updated full dataframe (with deeploc info mapped)
else:
raise Exception(f'Invalid mode: {mode}')
def GetIDList(self, protein_df_current_uni, save=False):
'''
Get a list of UniProt IDs that need to be annotated
input:
- protein_df_current_uni: dataframe of existing protein annotations
- save: if True, save the ID list to data/uniprotID.txt
result:
- update: self.uniprotIDs is set to the list of new uniprot IDs
(if save is True, the list is written to data/uniprotID.txt)
'''
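# Entries may hold several comma-separated accessions (e.g. a hypothetical "P12345,Q67890" expands
# into two IDs below); NaN/None entries are skipped.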
print(f'getting the uniprotID list')
#get unique set of uniprot IDs
IDs = list(set(self.full_df["protein_UniProt_ID"].to_list()))
#print(f'Number of unique UniProt IDs in full (current + queried) dataset: {len(IDs)}')
if protein_df_current_uni is not None:
old_IDs = list(set(protein_df_current_uni["protein_UniProt_ID"].to_list()))
#print(f'Number of unique UniProt IDs in existing dataset: {len(old_IDs)}')
else:
old_IDs = []
#remove nan and expand the ones with multiple IDs (only getting IDs that are not in the existing annotation file)
IDs_format = []
for ID in IDs:
if (str(ID) != 'nan') and (ID is not None):
if "," in ID:
IDs_format.extend(ID.split(","))
else:
IDs_format.append(ID)
self.uniprotIDs = list(set(IDs_format)-set(old_IDs))
if save:
with open("data/uniprotID.txt", 'w') as output:
for row in self.uniprotIDs:
output.write(str(row) + '\n')
def QueryUniProt(self):
'''
Take self.uniprotIDs (new UniProt IDs that need to be queried), get uniprot protein annotations (mapping uniprotIDs to their annotations)
Save this in self.protein_df_uni
'''
if self.uniprotIDs is None:
raise Exception("need to run the function GetIDList first")
print(f'start querying {len(self.uniprotIDs)} uniprot IDs from UniProt')
# define subfunctions >>>
# (source:https://www.uniprot.org/help/api_queries & https://www.uniprot.org/help/id_mapping)
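# Protocol implemented by these helpers: POST the IDs to /idmapping/run to obtain a jobId,
# poll /idmapping/status/<jobId> until the job leaves the NEW/RUNNING states, then page through
# the results by following the 'Link: <...>; rel="next"' response header
# (the x-total-results header reports the total number of results).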
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 3
def get_next_link(headers):
if "Link" in headers:
match = re_next_link.match(headers["Link"])
if match:
return match.group(1)
def get_batch(batch_url):
while batch_url:
response = session.get(batch_url)
response.raise_for_status()
total = response.headers["x-total-results"]
yield response, total
batch_url = get_next_link(response.headers)
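# Poll the status endpoint every POLLING_INTERVAL seconds; once the response carries results
# (or failed IDs) instead of a jobStatus, the mapping is ready.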
def check_id_mapping_results_ready(job_id):
while True:
request = session.get(f"{API_URL}/idmapping/status/{job_id}")
check_response(request)
j = request.json()
if "jobStatus" in j:
if j["jobStatus"] in ("NEW", "RUNNING"):
print(f"Job not ready: Retrying in {POLLING_INTERVAL}s")
time.sleep(POLLING_INTERVAL)
else:
raise Exception(j["jobStatus"])
else:
return bool(j["results"] or j["failedIds"])
def check_response(response):
try:
response.raise_for_status()
except requests.HTTPError:
print(response.json())
raise
# <<< define subfunctions
# get the jobID corresponding to the IDs to be queried:
IDstring = ','.join(self.uniprotIDs)
#print(f'IDstring: {IDstring}')
files = {
'from': (None, 'UniProtKB_AC-ID'),
'to': (None, 'UniProtKB'),
'ids': (None, IDstring) # IDs must be comma-separated with no spaces here #TODO: why not recognizing?
}
response = requests.post('https://rest.uniprot.org/idmapping/run', files=files)
print('finished posting request')
#Check response status before loading the request results
re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
jobID = json.loads(response.text)['jobId']
if check_id_mapping_results_ready(jobID):
# format the url with the jobID and the fields we want:
url=f'https://rest.uniprot.org/idmapping/uniprotkb/results/{jobID}?fields=accession%2Cxref_reactome%2Cxref_pfam%2Cxref_interpro%2Ccc_subcellular_location&format=json&size=500'
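# %2C is a URL-encoded comma: the requested fields are accession, xref_reactome, xref_pfam,
# xref_interpro and cc_subcellular_location, returned as JSON in pages of at most 500 entries.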
print(f'url: {url}')
# query from the API and put the data in a dataframe:
dfs = []
for batch, total in get_batch(url):
out_dict = json.loads(batch.text)
for entry in out_dict['results']:
json_entry = entry['to']
json_parsed_df = self.Parse_json(json_entry)
dfs.append(json_parsed_df)
df_all = pd.concat(dfs, ignore_index=True)
# classify the membrane type
df_all = self.MemType(df_all)
# concatenate old and new protein annotations, and update it to self.protein_df_uni
if self.protein_df_uni is not None:
self.protein_df_uni = pd.concat([self.protein_df_uni, df_all], ignore_index=True)
else:
self.protein_df_uni = df_all
def Parse_json(self,json_entry):
'''
Subfunction of QueryUniProt(): parse one UniProtKB JSON entry into a one-row dataframe
'''
#uniprot ID
uniprotID = json_entry['primaryAccession']
#subcellular ID and key & subcellular topology ID and key
subcell_ID, subcell_key = [], []
topology_ID, topology_key = [], []
try:
comments = json_entry['comments']
for sub_comment in comments:
if sub_comment["commentType"] == "SUBCELLULAR LOCATION":
sub_info = sub_comment['subcellularLocations']
#print(f'sub_info: {sub_info}')
for item in sub_info:
#print(f'item: {item}')
subcell_key.append(item['location']['value'])
subcell_ID.append(item['location']['id'])
topology_key.append(item['topology']['value'])
topology_ID.append(item['topology']['id'])
except:
#traceback.print_exc()
#print('unable to access comments for subcellular info')
pass
#protein family - InterPro and Pfam// Metabolic Pathway - Reactome
interpro_ID, interpro_value, pfam_ID, pfam_value = [],[],[],[]
reactome_ID, reactome_value = [],[]
try:
crossref = json_entry['uniProtKBCrossReferences']
for entry in crossref:
#interpro
try:
if entry['database'] == "InterPro":
#ID
interpro_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'EntryName':
interpro_value.append(prop_entry['value'])
except:
pass
#print('unable to access InterPro data')
#pfam
try:
if entry['database'] == "Pfam":
#ID
pfam_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'EntryName':
pfam_value.append(prop_entry['value'])
except:
pass
#print('unable to access Pfam data')
#reactome
try:
if entry['database'] == "Reactome":
#ID
reactome_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'PathwayName':
reactome_value.append(prop_entry['value'])
except:
pass
#print('unable to access reactome data')
except:
pass
#print('unable to access cross reference database data')
list_entry = [str(uniprotID), str(subcell_ID), str(subcell_key), str(topology_ID),str(topology_key),str(interpro_ID), str(interpro_value), str(pfam_ID), str(pfam_value), str(reactome_ID), str(reactome_value)]
cols = ['uniprotID', 'subcellular_ID', 'subcellular_Key', 'toplology_ID', 'topology_Key', 'InterPro_ID', 'InterPro_Key', 'Pfam_ID', 'Pfam_Key', 'Reactome_ID', 'Reactome_Key']
'''
if len(subcell_ID) != len(subcell_key) or len(interpro_ID) != len(interpro_value) or len(pfam_ID) != len(pfam_value) or len(reactome_ID) != len(reactome_value) or len(topology_ID) != len(topology_key):
raise Exception(f"inconsistent length of ID and key. uniprotID: {uniprotID} ")
#print('inconsistent length of ID and key')
'''
#assert len(subcell_ID) == len(subcell_key), f"{uniprotID} has inconsistent subcell length"
assert len(topology_ID) == len(topology_key), f"{uniprotID} has inconsistent topology length, ID:{topology_ID}, key:{topology_key}"
for i in range(len(list_entry)):
if list_entry[i] == "[]":
list_entry[i] = ""
df_entry = pd.DataFrame([list_entry], columns=cols)
return df_entry
def MemType(self, df):
'''
Subfunction of QueryUniProt()
Generate a dataframe with membrane type classification results without the original subloc details
input:
- df: the dataframe of results from uniprot
output:
- df_class: dataframe with membrane types
'''
#define membrane type mapping (SL-ID from: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/subcell.txt)
peripheral_IDs = ['SL-9903']
lipidanchor_IDs = ['SL-9901', 'SL-9902', 'SL-9920']
transmembrane_IDs = ['SL-9909', 'SL-9904', 'SL-9905', 'SL-9906', 'SL-9907', 'SL-9908']
soluble = ['SL-0170', 'SL-0215', 'SL-0309', 'SL-0086', 'SL-0188', 'SL-0190', 'SL-0243', 'SL-0096', 'SL-0156', 'SL-0133', 'SL-0202', 'SL-0200']
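# Topology IDs determine Peripheral/Lipid-anchor/Transmembrane; subcellular-location IDs determine Soluble.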
df_class = df.copy()
def classify(subcellular_ID, toplology_ID):
classes = []
#if not pd.isna(toplology_ID):
if not toplology_ID == "":
for ID in literal_eval(toplology_ID):
if ID in peripheral_IDs:
classes.append("Peripheral")
if ID in lipidanchor_IDs:
classes.append("Lipid-anchor")
if ID in transmembrane_IDs:
classes.append("Transmembrane")
if not subcellular_ID == "":
for ID in literal_eval(subcellular_ID):
if ID in soluble:
classes.append("Soluble")
if len(classes) == 0:
return ""
else:
return list(set(classes))
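# Illustrative example: classify("['SL-0086']", "['SL-9903']") yields ['Peripheral', 'Soluble']
# (element order may vary, since duplicates are removed through a set).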
df_class['MembraneType_UniProt'] = df_class.apply(lambda x: classify(x.subcellular_ID, x.toplology_ID), axis=1)
df_class.drop(columns=['subcellular_ID', 'subcellular_Key', 'toplology_ID', 'topology_Key'], inplace=True)
return df_class
def MapUni(self, full_dataframe, protein_uniprot_dataframe):
'''
Map the uniprot information to the full dataframe, and update the mapped dataframe as self.full_df
input:
- full_dataframe: full dataframe
- protein_uniprot_dataframe: protein uniprot annotation file
'''
full_dataframe['uniprot_single'] = full_dataframe['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
protein_uniprot_dataframe = protein_uniprot_dataframe.rename(columns={'uniprotID':'uniprot_single'}) #change the column name to map onto full_dataframe
full_dataframe = full_dataframe.merge(protein_uniprot_dataframe, how='left', on='uniprot_single')
full_dataframe.drop(columns=['uniprot_single'], inplace=True)
#update self.full_df
self.full_df = full_dataframe
def GetMemType_DeepLoc(self):
'''
(1) Save a pre-deeploc annotation file (matching: seqID, sequences, uniprotIDs) that needs deepLoc to classify
save this file as unclass_protein.csv (seqID as headers in the fasta file)
# For entries with uniprotIDs, remove entries with repeating uniprotIDs
# For entries without uniprotIDs, remove entries with repeating sequences
(2) Get the fasta files for deeploc submission
'''
directory = 'data/DeepLoc'
os.makedirs(directory, exist_ok = True)
# Get a subset of full_df (unclassified_df) of entries that don't already have membrane classifications:
self.full_df['MembraneType_UniProt'] = self.full_df['MembraneType_UniProt'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
unclassified_df = self.full_df.loc[self.full_df['MembraneType_UniProt'].isna()]
unclassified_df = unclassified_df[['protein_UniProt_ID', 'protein_Sequence']].copy() #work on a copy so the column assignments below don't warn on a slice of full_df
# TODO: Delete the already classified entries (if there is existing deepLoc annotation file)
# Process uniprotID and drop the duplicates for uniprotID
unclassified_df['protein_UniProt_ID'] = unclassified_df['protein_UniProt_ID'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
unclassified_df['protein_UniProt_ID'] = unclassified_df['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
unclassified_df = unclassified_df[(~unclassified_df['protein_UniProt_ID'].duplicated()) | (unclassified_df['protein_UniProt_ID'].isna())] #drop uniprot duplicates but don't drop nan
# format the sequence to remove "()"
unclassified_df['protein_Sequence'] = unclassified_df['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
#keep a dictionary to map unique uniprotID to seqID
dict_Uni2SeqID = {}
for uniprotID in unclassified_df['protein_UniProt_ID'].to_list():
if not pd.isnull(uniprotID):
ID = uuid.uuid4()
dict_Uni2SeqID[uniprotID] = ID
#keep a dictionary to map unique sequences (without uniprotIDs) to seqID
seq_withoutUniID = list(set(unclassified_df.loc[unclassified_df['protein_UniProt_ID'].isna(), 'protein_Sequence'].to_list()))
dict_Seq2SeqID = {}
for seq in seq_withoutUniID:
ID = uuid.uuid4()
dict_Seq2SeqID[seq] = ID
# Generate the column seqID
def getSeqID(uniprot, sequence):
if (isinstance(uniprot, str)):
if uniprot != "":
seqID = dict_Uni2SeqID[uniprot]
return seqID
seqID = dict_Seq2SeqID[sequence]
return seqID
unclassified_df['seqID'] = unclassified_df.apply(lambda x: getSeqID(x.protein_UniProt_ID, x.protein_Sequence), axis=1)
unclassified_df.drop_duplicates(subset=['seqID'], inplace=True) #eliminate the duplicate sequences from the entries without uniprot IDs
unclassified_df.to_csv("data/DeepLoc/unclass_protein.csv") #save this file
# Format them into a FASTA format and save it
seqs = unclassified_df['protein_Sequence'].tolist()
seqIDs = unclassified_df['seqID'].tolist()
#save multiple fasta files with up to 500 sequences each (the deeploc webserver accepts at most 500 sequences per file)
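# e.g. 1,203 sequences (a made-up count) would yield deeplocInput_1.fasta and deeplocInput_2.fasta
# with 500 sequences each, plus deeplocInput_3.fasta with the remaining 203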
i=0
for x in range(0, len(seqs), 500):
i+=1
sub_seqIDs, sub_seqs = seqIDs[x:x+500], seqs[x:x+500]
with open(f"data/DeepLoc/deeplocInput_{i}.fasta", "w") as f:
for id, seq in zip(sub_seqIDs, sub_seqs):
f.write(">" + str(id) + "\n" + seq + "\n")
print('FASTA files deeplocInput_<i>.fasta produced in data/DeepLoc. Upload the files to https://services.healthtech.dtu.dk/services/DeepLoc-2.1/ and select the slow model and short output.')
print('Please place all the result csv files in data/DeepLoc')
def GetMemType_DeepLoc_Old(self):
'''
(Legacy version superseded by GetMemType_DeepLoc; note that it still references self.protein_df_current, which no longer exists on this class.)
Get the fasta file of all proteins without classifications and save unclass_protein.csv, which maps uniprot IDs, sequence --> sequenceID (used as headers in the fasta file)
'''
directory = 'data/DeepLoc'
os.makedirs(directory, exist_ok = True)
# Get a list of UniProt IDs that already have the membrane classifications:
protein_df = pd.read_csv("data/protein_uniprotMap.txt", sep='\t')
protein_df['MembraneType_UniProt'] = protein_df['MembraneType_UniProt'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
UniProt_classified = (protein_df.loc[~protein_df['MembraneType_UniProt'].isna(), 'uniprotID']).to_list()
# Get a list of UniProt Ids are in the existing protein annotation file
if self.protein_df_current is not None:
UniProt_Ignore = self.protein_df_current['protein_UniProt_ID'].to_list()
else:
UniProt_Ignore = []
# Get a dataframe of unique protein sequences that need to be classified for membrane classifications:
biodolphin_df_unclassified = self.full_df.loc[~self.full_df['protein_UniProt_ID'].isin(UniProt_classified), ['protein_UniProt_ID', 'protein_Sequence']] #not classified by uniprot already
biodolphin_df_unclassified = biodolphin_df_unclassified.loc[~biodolphin_df_unclassified['protein_UniProt_ID'].isin(UniProt_Ignore)] # not in the protein annotation file already
# format the sequence to remove "()"
biodolphin_df_unclassified['protein_Sequence'] = biodolphin_df_unclassified['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
# drop duplicates of protein sequence
biodolphin_df_unclassified.drop_duplicates(subset=['protein_Sequence'],inplace=True)
# drop duplicates of uniprot ID (but keep ones without uniprot IDs)
biodolphin_df_unclassified = biodolphin_df_unclassified[(~biodolphin_df_unclassified['protein_UniProt_ID'].duplicated()) | (biodolphin_df_unclassified['protein_UniProt_ID'].isna())]
# assign sequence ID for matching
seqIDs = [uuid.uuid4() for x in range(biodolphin_df_unclassified.shape[0])]
biodolphin_df_unclassified['sequenceID'] = seqIDs
biodolphin_df_unclassified.to_csv("data/DeepLoc/unclass_protein.csv")
# Format them into a FASTA format and save it
seqs = biodolphin_df_unclassified['protein_Sequence'].tolist()
#print(f"number of protien sequences:{len(seqs)}")
#save multiple fasta files with 500 sequences(max num of sequence is 500 on deeploc webserver)
i=0
for x in range(0, len(seqs), 500):
i+=1
sub_seqIDs, sub_seqs = seqIDs[x:x+500], seqs[x:x+500]
with open(f"data/DeepLoc/deeplocInput_{i}.fasta", "w") as f:
for id, seq in zip(sub_seqIDs, sub_seqs):
f.write(">" + str(id) + "\n" + seq + "\n")
print('FASTA files deeplocInput_<i>.fasta produced in data/DeepLoc. Upload the files to https://services.healthtech.dtu.dk/services/DeepLoc-2.1/ and select the slow model and short output.')
print('Please place all the result csv files in data/DeepLoc')
def CombineDeepLoc(self):
'''
Read the results files from DeepLoc and incorporate the classifications
Result: update self.protein_df_deep as the protein annotation file with deeploc information (seqID, uniprotID, sequence, MemClass-deeploc)
'''
# results of deeploc: get df_ID2Mem to map seqID to MembraneType_DeepLoc
fn_results = [filename for filename in os.listdir('./data/DeepLoc') if filename.startswith("results_")]
dfs_results = [pd.read_csv('./data/DeepLoc/'+fn) for fn in fn_results]
df_ID2Mem = pd.concat(dfs_results)[["Protein_ID", "Membrane types"]].rename(columns={"Protein_ID": "seqID", "Membrane types":"MembraneType_DeepLoc"})
df_ID2Mem['MembraneType_DeepLoc'] = df_ID2Mem['MembraneType_DeepLoc'].apply(lambda x: x.split("|"))
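# DeepLoc's short output lists membrane types as a pipe-separated string, so a value such as
# "Transmembrane|Lipid anchor" (illustrative) becomes the list ['Transmembrane', 'Lipid anchor']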
# read the unclass_protein.csv (protein_UniProt_ID,protein_Sequence,seqID) and map MembraneType_DeepLoc onto it
unclass_df = pd.read_csv('./data/DeepLoc/unclass_protein.csv',index_col=False)
predict_df = unclass_df.merge(df_ID2Mem,how='left', on='seqID') #predict_df: (protein_UniProt_ID, protein_Sequence, seqID, MembraneType_DeepLoc)
self.protein_df_deep = predict_df
#create entries for proteins with more than one uniprot ID
#predict_df['protein_UniProt_ID'] = predict_df['protein_UniProt_ID'].str.split(',')
#predict_df = predict_df.explode('protein_UniProt_ID')
#predict_df = predict_df[(~predict_df['protein_UniProt_ID'].duplicated()) | (predict_df['protein_UniProt_ID'].isna())] #drop duplicates
# combine the prediction file and the uniprot annotated file as a final protein annotation file:
# map the ones with uniprot IDs:
#uniprot_df = pd.read_csv('data/protein_uniprotMap.txt', sep='\t')
#uniprot_df = uniprot_df.rename(columns={'uniprotID':'protein_UniProt_ID'})
#combined_df = uniprot_df.merge(predict_df[["protein_UniProt_ID","MembraneType_DeepLoc"]], how='left', on='protein_UniProt_ID')
#combined_df['withUniProtID'] = True
#combined_df['protein_Sequence'] = ""
# map the ones without uniprot IDs and add them to the dataframe too:
#predict_df_sequence = predict_df[predict_df['protein_UniProt_ID'].isna()][["protein_Sequence","MembraneType_DeepLoc"]]
#predict_df_sequence['withUniProtID'] = False
#combined_df = pd.concat([combined_df, predict_df_sequence], ignore_index=True, sort=False)
# rename columns with protein prefix:
#combined_df = combined_df.rename(columns={"InterPro_ID": "protein_InterPro_ID", "InterPro_Key":"protein_InterPro_Key", "Pfam_ID":"protein_Pfam_ID", "Pfam_Key":"protein_Pfam_Key", "Reactome_ID":"protein_Reactome_ID", "Reactome_Key":"protein_Reactome_Key", "MembraneType_UniProt":"protein_MembraneType_UniProt", "MembraneType_DeepLoc":"protein_MembraneType_DeepLoc"})
#combined_df = combined_df.loc[:, ~combined_df.columns.str.contains('^Unnamed')] #remove extra index columns
# place the combined dataframe to self.protein_df:
#self.protein_df = combined_df
print('Finished combining protein annotation dataframes from deeploc and uniprot')
def MapDeep(self, full_dataframe, protein_deep_dataframe):
'''
Map the deeploc information onto the full dataframe, and update the mapped dataframe as self.full_df
input:
- full_dataframe: full dataframe
- protein_deep_dataframe: protein deeploc annotation file
'''
# map the deepLoc prediction from uniprotID (for the entries with uniprotID)
full_dataframe['uniprot_single'] = full_dataframe['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
protein_df_uniprot = protein_deep_dataframe.loc[~protein_deep_dataframe['protein_UniProt_ID'].isna()] # subset of protein annotation file where the protein has UniProt IDs
protein_df_uniprot = protein_df_uniprot[['protein_UniProt_ID', 'MembraneType_DeepLoc']].copy() #get the needed columns (copy so the rename below doesn't warn on a slice)
protein_df_uniprot.rename(columns={'protein_UniProt_ID':'uniprot_single'}, inplace=True)
full_dataframe = full_dataframe.merge(protein_df_uniprot, how='left', on='uniprot_single')
full_dataframe.drop(columns=['uniprot_single'], inplace=True)
# map the deepLoc prediction from sequence (for the entries without uniprotID)
full_dataframe['protein_Sequence_format'] = full_dataframe['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
protein_df_sequence = protein_deep_dataframe.loc[protein_deep_dataframe['protein_UniProt_ID'].isna()].copy() # subset of protein annotation file where the proteins don't have UniProt IDs (need to be matched by protein sequences)
protein_df_sequence.rename(columns={'protein_Sequence':'protein_Sequence_format'}, inplace=True)
protein_df_sequence = protein_df_sequence[['protein_Sequence_format', 'MembraneType_DeepLoc']]
full_dataframe = full_dataframe.merge(protein_df_sequence, how='left', on='protein_Sequence_format')
# process the repeating protein_MembraneType_DeepLoc from the two methods above
def SelectDeepLoc(x, y):
if isinstance(x, list):
return x
else:
return y
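# MembraneType_DeepLoc_x comes from the uniprot-ID merge and MembraneType_DeepLoc_y from the
# sequence merge; the uniprot-based match is preferred whenever it produced a list.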
full_dataframe['protein_MembraneType_DeepLoc'] = full_dataframe.apply(lambda x: SelectDeepLoc(x.MembraneType_DeepLoc_x, x.MembraneType_DeepLoc_y), axis=1)
full_dataframe.drop(columns=['protein_Sequence_format'] , inplace=True)
full_dataframe.drop(columns=['MembraneType_DeepLoc_x', 'MembraneType_DeepLoc_y'] , inplace=True)
#update self.full_df
self.full_df = full_dataframe
def Format(self):
'''
Format the final dataframe
Result:
Place the formatted full dataframe to self.full_df
'''
df = self.full_df
# remove columns
df = df.drop(columns=['New_query'])
# rename columns
df = df.rename(columns={"InterPro_ID": "protein_InterPro_ID", "InterPro_Key": "protein_InterPro_Key", "Pfam_ID":"protein_Pfam_ID", "Pfam_Key":"protein_Pfam_Key", "Reactome_ID":"protein_Reactome_ID", "Reactome_Key":"protein_Reactome_Key", "MembraneType_UniProt":"protein_MembraneType_UniProt"})
# adjust column orders
df = df[['BioDolphinID', 'complex_PDB_ID', 'complex_Resolution', 'complex_Receptor_Chain', \
'complex_Receptor_asymChain', 'complex_Ligand_Chain', \
'complex_Ligand_Serial_Number', 'complex_Residue_number_of_the_ligand', \
'complex_Binding_Site_Number_Code', \
'complex_Binding_Site_Residues_(PDB)', \
'complex_Binding_Site_Residues_(Re-numbered)', \
'complex_Catalytic_Site_Residues_(PDB)', \
'complex_Catalytic_Site_Residues_(Re-numbered)', \
'complex_Binding_Affinity_(Manual_Survey)', \
'complex_Binding_Affinity_(Binding_MOAD_database)', \
'complex_Binding_Affinity_(PDBbind-CN_database)', \
'complex_Binding_Affinity_(BindingDB_database)', 'complex_PubMed_ID', \
'protein_UniProt_ID', 'protein_Name', 'protein_Synonyms', \
'protein_Sequence', 'protein_EC_number', 'protein_Organism', \
'protein_Biological_Process_(GO)', 'protein_Cellular_Component_(GO)', \
'protein_Molecular_Function_(GO)', 'entry_source', 'protein_InterPro_ID', 'protein_InterPro_Key', \
'protein_Pfam_ID', 'protein_Pfam_Key', 'protein_Reactome_ID', 'protein_Reactome_Key', \
'protein_MembraneType_UniProt', 'protein_MembraneType_DeepLoc', \
'lipid_Ligand_ID_CCD', 'lipid_PDB_Name', 'lipid_Synonyms', \
'lipid_InChI', 'lipid_InChIKey', 'lipid_Canonical_smiles', \
'lipid_Isomeric_smiles', 'lipid_IUPAC_name', 'lipid_Cid', \
'lipid_drugbank', 'lipid_ChEBI', 'lipid_ChEMBL', \
'lipid_Molecular_formula', 'lipid_Molecular_weight', \
'lipid_Lipidmaps_categories', 'lipid_Lipidmaps_terms', \
'lipid_class_source', 'lipid_pharmacological_class', 'lipid_adverse', \
'lipid_disease']]
self.full_df = df
'''
def Update(self):
Combine old protein annotation file with new protein annotation file
Result:
Save the updated annotation file at self.protein_df
new_protein_df = self.protein_df
old_protein_df = self.protein_df_current
if old_protein_df is not None:
assert new_protein_df.columns == old_protein_df.columns
self.protein_df = pd.concat([old_protein_df, new_protein_df])
else:
self.protein_df = new_protein_df
'''
def SaveProtAnno(self, df, prefix):
'''
save the given protein annotation dataframe to data/<prefix>.txt and data/<prefix>.csv
input:
- df: protein annotation dataframe to save
- prefix: prefix of the filename to save as
results:
- data/<prefix>.txt: mapping of protein information in tab-separated txt format
- data/<prefix>.csv: mapping of protein information in csv format
'''
protein_filepath = "data/"+prefix
protein_filepath_txt, protein_filepath_csv = protein_filepath + ".txt", protein_filepath + ".csv"
print(f'saving the protein mapping dataframe to: {protein_filepath_txt} and {protein_filepath_csv}')
df.to_csv(protein_filepath_txt, index=False,sep='\t')
df.to_csv(protein_filepath_csv, index=False)
if __name__ == "__main__":
full_df = pd.read_csv('data/BioDolphin_vr1.0.txt', sep='\t')
full_df["New_query"] = True
protannotator = ProteinAnnotation(full_df)
#protannotator.protein_df = pd.read_csv("data/protein_uniprotMap.txt", sep='\t')
#protannotator.GetMemType_DeepLoc()
df = protannotator.Run(mode='queryUniprot', date='test')
df.to_csv('test_mapuni.csv')
print('finished')
'''
# to get protein annotations from uniprot:
t_start = time.time()
print('defining protein annotation class:')
protannotator = ProteinAnnotation('BioDolphin_vr1.0.txt')
print('getting ID list')
protannotator.GetIDList(save=True)
print('querying from uniprot')
protannotator.QueryUniProt()
#print('saving the files')
protannotator.SaveProtAnno()
#print(len(protannotator.uniprotIDs))
t_end = time.time()
print(f'run time: {t_end - t_start} sec')
'''