-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.py
50 lines (30 loc) · 1.66 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
'''
>Python
Functions to (1) combine old and new data and (2) map lipid annotations
'''
import pandas as pd
'''
Combine current dataset and new queried dataset, and map the lipid annotation file
'''
def Concat(current_df, new_df, lipid_df):
    '''
    Combine the current dataset with newly queried entries and re-map the
    lipid annotations.

    Args:
        current_df: DataFrame holding the existing dataset.
        new_df: DataFrame holding the newly queried entries. It is not
            modified; the flag columns are added to a copy.
        lipid_df: DataFrame of lipid annotations, joined on
            'lipid_Ligand_ID_CCD'.

    Returns:
        DataFrame containing the rows of both inputs with the lipid
        annotation columns taken from lipid_df, a 'BioDolphinID' column,
        and one row per BioDolphinID.

    Raises:
        KeyError: if any of the lipid annotation columns to be dropped is
            missing from the combined frame.
    '''
    # Flag the new entries on a copy so the caller's frame is left untouched.
    new_df = new_df.assign(complex_Ligand_Serial_Number=1, New_query=True)

    # Existing lipid annotations are discarded so that the merge below is the
    # only source of these columns (avoids _x/_y suffixed duplicates).
    stale_lipid_columns = [
        'lipid_PDB_Name', 'lipid_Synonyms', 'lipid_InChI', 'lipid_InChIKey',
        'lipid_Canonical_smiles', 'lipid_Isomeric_smiles', 'lipid_IUPAC_name',
        'lipid_Cid', 'lipid_drugbank', 'lipid_ChEBI', 'lipid_ChEMBL',
        'lipid_Molecular_formula', 'lipid_Molecular_weight',
        'lipid_Lipidmaps_categories', 'lipid_Lipidmaps_terms',
        'lipid_class_source', 'lipid_LMSD_ID',
    ]
    df_combined = pd.concat([current_df, new_df]).drop(columns=stale_lipid_columns)

    # Map lipid annotations. merge() defaults to an inner join, so rows whose
    # lipid_Ligand_ID_CCD has no entry in lipid_df are not kept.
    df_combined = df_combined.merge(lipid_df, on='lipid_Ligand_ID_CCD')

    # Assign an ID to each entry. Iterating only the five needed columns
    # avoids building a Series per row as DataFrame.apply(axis=1) does.
    df_combined["BioDolphinID"] = [
        getID(pdb, chain_rec, chain_lig, ccd_lig, serial_lig)
        for pdb, chain_rec, chain_lig, ccd_lig, serial_lig in zip(
            df_combined['complex_PDB_ID'],
            df_combined['complex_Receptor_Chain'],
            df_combined['complex_Ligand_Chain'],
            df_combined['lipid_Ligand_ID_CCD'],
            df_combined['complex_Ligand_Serial_Number'],
        )
    ]

    # Keep the first occurrence of each ID (drop_duplicates default).
    return df_combined.drop_duplicates(subset=['BioDolphinID'])
'''
Subfunction for Concat
'''
def getID(pdb, chain_rec, chain_lig, CCD_lig, serial_lig):
    '''
    Build the entry ID string from its five components.

    The result has the form 'BD<pdb>-<chain_rec>-<chain_lig>-<CCD_lig><serial_lig>';
    note there is no separator between the last two components.
    '''
    return 'BD{}-{}-{}-{}{}'.format(pdb, chain_rec, chain_lig, CCD_lig, serial_lig)
if __name__ == "__main__":
    # Script entry point: load the two tab-separated input tables.
    tsv_options = {'sep': '\t'}
    df = pd.read_csv('data/STEP1_BioDolphin_vr1.1.txt', **tsv_options)
    protein_df = pd.read_csv('data/protein_annotations_2024-09-06.txt', **tsv_options)