-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
216 lines (148 loc) · 8.64 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
'''
Main script curate new data and integrate them into BioDolphin
Assumption is:
under data/, we have: (1) BioDolphin_vr1.0.txt (2) lipid_annotations.txt (3) protein_annotations.txt
Usage:
(1) when complete lipid annotation file exist. Only need to query new PDB entries
--> python main.py -d BioDolphin_vr1.0.txt -l lipid_annotations.txt -o BioDolphin_vr1.1.txt --update
(2) don't have lipid annotaiton file yet.
python3 main.py [options]
'''
import argparse
from get_lipidAnnotation import *
from get_proteinAnnotation import *
from get_newEntries import *
from combine import *
from affinity import *
from get_format import *
import warnings
warnings.filterwarnings("ignore")
if __name__ == "__main__":
# setup arguments
parser = argparse.ArgumentParser(description='Add arguments')
parser.add_argument('-d','--dataset', help='current dataset filename (.txt) in the data directory', default="BioDolphin_vr1.0.txt", type=str, required=False)
parser.add_argument('-l','--lipidfile', help='lipid anotation file filename (.txt) in the data directory', default=None, type=str, required=False)
parser.add_argument('-p','--proteinfile_uni', help='protein anotation (uniprot) file filename (.txt) in the data directory', default=None, type=str, required=False)
parser.add_argument('-m','--proteinfile_deep', help='protein anotation (deeploc membrane class prediction) file filename (.txt) in the data directory', default=None, type=str, required=False)
parser.add_argument('-o','--output', help='output dataset filename prefix in the data directory', type=str, required=False)
parser.add_argument('-r', '--reportfile', help='report file (.txt) used to define which pdbs to query', default=None, type=str, required=False)
parser.add_argument('--step1', action=argparse.BooleanOptionalAction)
parser.add_argument('--step2', action=argparse.BooleanOptionalAction)
parser.add_argument('--report', action=argparse.BooleanOptionalAction)
args = parser.parse_args()
DATASET_CURRENT = "data/" + args.dataset
REPORT_FILE = args.reportfile
OUTPUT = args.output
STEP1 = args.step1
STEP2 = args.step2
REPORT = args.report
print(f'>>>>>>>> current database is at: {DATASET_CURRENT}')
if REPORT_FILE is not None:
print(f'>>>>>>> using {REPORT_FILE} as the report to query')
if args.lipidfile is not None:
LIPID_ANNOTATION = "data/" + args.lipidfile
print(f'>>>>>>> lipid annotation file is at: {LIPID_ANNOTATION}')
else:
LIPID_ANNOTATION = None
print(f'>>>>>>> no lipid annotation file is read')
if args.proteinfile_uni is not None:
PROTEIN_ANNOTATION_UNI = "data/" + args.proteinfile_uni
print(f'>>>>>>> protein annotation uniprot file is at: {PROTEIN_ANNOTATION_UNI}')
else:
PROTEIN_ANNOTATION_UNI = None
print(f'>>>>>>> no protein annotation uniprot file is read')
if args.proteinfile_deep is not None:
PROTEIN_ANNOTATION_DEEP = "data/" + args.proteinfile_deep
print(f'>>>>>>> protein annotation deeploc file is at: {PROTEIN_ANNOTATION_DEEP}')
else:
PROTEIN_ANNOTATION_DEEP = None
print(f'>>>>>> no protein annotation deeploc file is read')
print(f'>>>>>>>>> output prefix is set as: {OUTPUT}')
if STEP1 == True:
'''
Add new PDB entries by automatically search for PDBs associated with our lipid list
'''
print('--------------STEP1: Begin to update the database!------------------')
t_start = time.time()
# Read lipid annotation file.
print(f'>>>>>> Reading Lipid Annotation File')
if LIPID_ANNOTATION is not None:
print(f'reading lipid annotation file')
AnnoLipid = LipidAnnotation(LIPID_ANNOTATION, source="lipidfile")
AnnoLipid.GetLipid_df()
lipid_df = AnnoLipid.lipid_df
else:
print(f'generating lipid annotation file and saving it')
AnnoLipid = LipidAnnotation(DATASET_CURRENT, source="biodolphin")
AnnoLipid.GetLipid_df()
AnnoLipid.SaveLipidAnno(prefix=f"lipid_annotations_{datetime.date.today()}")
lipid_df = AnnoLipid.lipid_df
print('>>>>> Finished Reading Lipid Annotation File')
#print(f'columns of the lipid annotation file: {lipid_df.columns}')
# Query new dataset from PDB
print(f'>>>>>>> Querying New Dataset From The PDB Database')
query = QueryEntry(DATASET_CURRENT, LIPID_ANNOTATION, REPORT_FILE)
query.Run(f"newdata_{datetime.date.today()}")
new_query_df = query.new_dataset_df
# Combine datasets, add lipid annotations, assign IDs
all_df = Concat(pd.read_csv(DATASET_CURRENT, sep='\t'), new_query_df, lipid_df)
print('>>>>>> Finished Querying New Dataset')
# Query Extra protein annotations (from UniProt) and Prepare input files for DeepLoc for step2
print('>>>>> Start Processing Protein Annotation')
if PROTEIN_ANNOTATION_UNI is not None:
print('reading existing protein annotation uniprot file')
protein_df_current_uni = pd.read_csv(PROTEIN_ANNOTATION_UNI, sep='\t') # existing protein df (mapping uniprot Id or seq to extra info)
else:
print('process without existing protein annotation uniprot file')
protein_df_current_uni = None
if PROTEIN_ANNOTATION_DEEP is not None:
print('reading existing protein annotation deeploc file')
protein_df_current_deep = pd.read_csv(PROTEIN_ANNOTATION_DEEP, sep='\t') # existing protein df (mapping uniprot Id or seq to extra info)
else:
print('process without existing protein annotation deeploc file')
protein_df_current_deep = None
AnnoProtein = ProteinAnnotation(all_df, protein_df_current_uni, protein_df_current_deep)
all_df = AnnoProtein.Run(mode='queryUniprot', date=f'{datetime.date.today()}') # This will map uniprot annotations to all_df and produce deeploc input file
print('>>>>> Finished Processing Protein Annotation')
# Save the full dataframe
all_df.to_csv(f'data/STEP1_{OUTPUT}.txt',index=False,sep='\t')
all_df.to_csv(f'data/STEP1_{OUTPUT}.csv',index=False)
print(f'--------------STEP1: Finished! File Saved as STEP1_{OUTPUT}.txt------------------')
t_end = time.time()
print(f'run time for the step: {t_end - t_start} sec')
print(f'Follow the instructions above before starting Step 2')
elif STEP2 == True:
'''
Add extra protein annotation via deep loc
'''
print('------------------STEP2: Begin to add DeepLoc Information!------------------')
# Read DeepLoc Results
AnnoProtein = ProteinAnnotation(full_df=pd.read_csv(f'data/STEP1_{OUTPUT}.txt',sep='\t'))
df_final = AnnoProtein.Run(mode='addDeepLoc',date=f'{datetime.date.today()}') #read deeploc results and map it back to the full df, and output the final dataframe
# Create columns for average affinity
df_final = AverageAffinity(df_final)
# Get residue numbers if not aleady there
df_final = TagResNumber(df_final)
# Get LMSD ID with processed lipid annotation file # TODO:Delete this line in the future!
df_final = LMSD(df_final, lipidfile='data/lipid_annotations_vr1.1.txt')
# Format the dataframe
df_final = Format(df_final)
# print the final number of entries
print(f'number of rows in the new dataset: {df_final.shape[0]}')
# Create Result Directory
directory = './result'
os.makedirs(directory, exist_ok = True)
# save the final dataframe
df_final.to_csv(f'result/{OUTPUT}.txt',index=False,sep='\t')
df_final.to_csv(f'result/{OUTPUT}.csv',index=False)
print(f'------------------STEP2: Finished! Files Saved as {OUTPUT}.txt and {OUTPUT}.csv in /result------------------')
elif REPORT == True:
'''
Generate a report for missing entries
'''
print(f'Begin to prepare report for missing PDB entries associated with the lipid list provided in {LIPID_ANNOTATION}')
print('Note that PDBs listed in the report may not always be the entries we want to collect (ie.DNA interactions)')
query = QueryEntry(DATASET_CURRENT, LIPID_ANNOTATION)
query.Report()
else:
print('please specify flags for either --step1 or --step2 or --report')