'''
Functions to obtain the protein annotation mapping
(annotations that either can't be obtained from the PDB API or are easier to obtain elsewhere)
source database: UniProt
#TODO: functions to incorporate an existing DeepLoc annotation file are not implemented yet
'''
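# Typical workflow (a sketch based on the Run() modes below; file names follow the defaults in this module):
#   1. Run(mode='queryUniprot', date=...) -> query UniProt, save data/protein_uniprotMap_<date>.txt/.csv,
#      and write data/DeepLoc/deeplocInput_<i>.fasta plus data/DeepLoc/unclass_protein.csv
#   2. submit the FASTA files to the DeepLoc 2.1 web server by hand and download the result CSVs into data/DeepLoc
#   3. Run(mode='addDeepLoc', date=...) -> combine the results_*.csv files with unclass_protein.csv,
#      save data/protein_deepLocMap_<date>.txt/.csv, and map the predictions onto the full dataframe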
import pandas as pd
import numpy as np
import requests
import time
import json
import re
from requests.adapters import HTTPAdapter, Retry
from ast import literal_eval
import uuid
import os
#import traceback
class ProteinAnnotation:
def __init__(self, full_df, protein_df_current_uni=None, protein_df_current_deep=None):
'''
input:
- full_df: dataframe of the biodolphin full dataset (current + queried) that are mapped with lipid annotations
- protein_df_current_uni: existing protein annotation file with uniprot information
- protein_df_current_deep: existing protein annotation file with deeploc information
'''
self.full_df = full_df # dataframe of the full dataset
if full_df is not None:
self.new_df = full_df[full_df["New_query"] == True] # dataframe of the subset that is newly queried
self.protein_df_uni = protein_df_current_uni
self.protein_df_deep = protein_df_current_deep
self.uniprotIDs = None
#self.protein_df = None
def Run(self, mode, date):
'''
Get the UniProt protein annotation file
input:
- mode: queryUniprot -> query uniprot info for the new uniprot IDs, map all uniprot info onto the full dataframe, then produce the deeploc input files
#TODO: addDeepLoc -> add deeploc info for the new uniprot IDs, combine the current and queried protein annotation files and save them
'''
if mode == 'queryUniprot':
# Query uniprot info and map it to the full dataframe
self.GetIDList(self.protein_df_uni, save=True) # set self.uniprotIDs to a list of uniprot IDs that are not in the existing protein annotation uniprot file
self.QueryUniProt() # query these uniprot ID from UniProt to get information, place the updated mapping at self.protein_df_uni, and update the uniprot mapping to self.full_df
self.SaveProtAnno(self.protein_df_uni, prefix=f"protein_uniprotMap_{date}") # save the uniprot mapping information (self.protein_df_uni)
self.MapUni(self.full_df, self.protein_df_uni) #map the uniprot information to the full dataframe and place it at self.full_df
self.GetMemType_DeepLoc() # get the deep loc fasta file
return self.full_df #return the updated full dataframe (with uniprot info mapped)
elif mode == 'addDeepLoc':
# Add DeepLoc results and map it to the full dataframe
self.CombineDeepLoc() # combine the deeploc results with the unclass_protein.csv to produce protein deeploc annotation file (place at self.protein_df_deep)
self.SaveProtAnno(self.protein_df_deep, prefix=f"protein_deepLocMap_{date}")
self.MapDeep(self.full_df, self.protein_df_deep)
#self.Update() #update the protein annotation file to include the new UniProt
#self.SaveProtAnno(prefix="protein_annotations")
return self.full_df #return the updated full dataframe (with deeploc info mapped)
else:
raise Exception(f'Invalid mode: {mode}')
def GetIDList(self, protein_df_current_uni, save=False):
'''
Get a list of UniProt IDs that need to be annotated
input:
- protein_df_current_uni: dataframe of existing protein annotations
- save: if True, save the ID list to data/uniprotID.txt
result:
- update: self.uniprotIDs is set to the list of new uniprot IDs
(if save is True, the list is written to data/uniprotID.txt)
'''
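# Entries may hold several comma-separated accessions (e.g. a hypothetical "P12345,Q67890" expands
# into two IDs below); NaN/None entries are skipped.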
print(f'getting the uniprotID list')
#get unique set of uniprot IDs
IDs = list(set(self.full_df["protein_UniProt_ID"].to_list()))
#print(f'Number of unique UniProt IDs in full (current + queried) dataset: {len(IDs)}')
if protein_df_current_uni is not None:
old_IDs = list(set(protein_df_current_uni["protein_UniProt_ID"].to_list()))
#print(f'Number of unique UniProt IDs in existing dataset: {len(old_IDs)}')
else:
old_IDs = []
#remove nan and expand the ones with multiple IDs (only getting IDs that are not in the existing annotation file)
IDs_format = []
for ID in IDs:
if (str(ID) != 'nan') and (ID is not None):
if "," in ID:
IDs_format.extend(ID.split(","))
else:
IDs_format.append(ID)
self.uniprotIDs = list(set(IDs_format)-set(old_IDs))
if save:
with open("data/uniprotID.txt", 'w') as output:
for row in self.uniprotIDs:
output.write(str(row) + '\n')
def QueryUniProt(self):
'''
Take self.uniprotIDs (new UniProt IDs that need to be queried), get uniprot protein annotations (mapping uniprotIDs to their annotations)
Save this in self.protein_df_uni
'''
if self.uniprotIDs is None:
raise Exception("need to run the function GetIDList first")
print(f'start querying {len(self.uniprotIDs)} uniprot IDs from UniProt')
# define subfunctions >>>
# (source:https://www.uniprot.org/help/api_queries & https://www.uniprot.org/help/id_mapping)
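# Protocol implemented by these helpers: POST the IDs to /idmapping/run to obtain a jobId,
# poll /idmapping/status/<jobId> until the job leaves the NEW/RUNNING states, then page through
# the results by following the 'Link: <...>; rel="next"' response header
# (the x-total-results header reports the total number of results).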
API_URL = "https://rest.uniprot.org"
POLLING_INTERVAL = 3
def get_next_link(headers):
if "Link" in headers:
match = re_next_link.match(headers["Link"])
if match:
return match.group(1)
def get_batch(batch_url):
while batch_url:
response = session.get(batch_url)
response.raise_for_status()
total = response.headers["x-total-results"]
yield response, total
batch_url = get_next_link(response.headers)
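# Poll the status endpoint every POLLING_INTERVAL seconds; once the response carries results
# (or failed IDs) instead of a jobStatus, the mapping is ready.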
def check_id_mapping_results_ready(job_id):
while True:
request = session.get(f"{API_URL}/idmapping/status/{job_id}")
check_response(request)
j = request.json()
if "jobStatus" in j:
if j["jobStatus"] in ("NEW", "RUNNING"):
print(f"Job not ready: Retrying in {POLLING_INTERVAL}s")
time.sleep(POLLING_INTERVAL)
else:
raise Exception(j["jobStatus"])
else:
return bool(j["results"] or j["failedIds"])
def check_response(response):
try:
response.raise_for_status()
except requests.HTTPError:
print(response.json())
raise
# <<< define subfunctions
# get the jobID corresponding to the IDs to be queried:
IDstring = ','.join(self.uniprotIDs)
#print(f'IDstring: {IDstring}')
files = {
'from': (None, 'UniProtKB_AC-ID'),
'to': (None, 'UniProtKB'),
'ids': (None, IDstring) # IDs must be comma-separated with no spaces here #TODO: why not recognizing?
}
response = requests.post('https://rest.uniprot.org/idmapping/run', files=files)
print('finished posting request')
#Check response status before loading the request results
re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
jobID = json.loads(response.text)['jobId']
if check_id_mapping_results_ready(jobID):
# format the url with the jobID and the fields we want:
url=f'https://rest.uniprot.org/idmapping/uniprotkb/results/{jobID}?fields=accession%2Cxref_reactome%2Cxref_pfam%2Cxref_interpro%2Ccc_subcellular_location&format=json&size=500'
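# %2C is a URL-encoded comma: the requested fields are accession, xref_reactome, xref_pfam,
# xref_interpro and cc_subcellular_location, returned as JSON in pages of at most 500 entries.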
print(f'url: {url}')
# query from the API and put the data in a dataframe:
dfs = []
for batch, total in get_batch(url):
out_dict = json.loads(batch.text)
for entry in out_dict['results']:
json_entry = entry['to']
json_parsed_df = self.Parse_json(json_entry)
dfs.append(json_parsed_df)
df_all = pd.concat(dfs, ignore_index=True)
# classify the membrane type
df_all = self.MemType(df_all)
# concatenate old and new protein annotations, and update it to self.protein_df_uni
if self.protein_df_uni is not None:
self.protein_df_uni = pd.concat([self.protein_df_uni, df_all], ignore_index=True)
else:
self.protein_df_uni = df_all
def Parse_json(self,json_entry):
'''
Subfunction of QueryUniProt(): parse one UniProtKB JSON entry into a one-row dataframe
'''
#uniprot ID
uniprotID = json_entry['primaryAccession']
#subcellular ID and key & subcellular topology ID and key
subcell_ID, subcell_key = [], []
topology_ID, topology_key = [], []
try:
comments = json_entry['comments']
for sub_comment in comments:
if sub_comment["commentType"] == "SUBCELLULAR LOCATION":
sub_info = sub_comment['subcellularLocations']
#print(f'sub_info: {sub_info}')
for item in sub_info:
#print(f'item: {item}')
subcell_key.append(item['location']['value'])
subcell_ID.append(item['location']['id'])
topology_key.append(item['topology']['value'])
topology_ID.append(item['topology']['id'])
except:
#traceback.print_exc()
#print('unable to access comments for subcellular info')
pass
#protein family - InterPro and Pfam// Metabolic Pathway - Reactome
interpro_ID, interpro_value, pfam_ID, pfam_value = [],[],[],[]
reactome_ID, reactome_value = [],[]
try:
crossref = json_entry['uniProtKBCrossReferences']
for entry in crossref:
#interpro
try:
if entry['database'] == "InterPro":
#ID
interpro_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'EntryName':
interpro_value.append(prop_entry['value'])
except:
pass
#print('unable to access InterPro data')
#pfam
try:
if entry['database'] == "Pfam":
#ID
pfam_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'EntryName':
pfam_value.append(prop_entry['value'])
except:
pass
#print('unable to access Pfam data')
#reactome
try:
if entry['database'] == "Reactome":
#ID
reactome_ID.append(entry['id'])
#values
properties = entry['properties']
for prop_entry in properties:
if prop_entry['key'] == 'PathwayName':
reactome_value.append(prop_entry['value'])
except:
pass
#print('unable to access reactome data')
except:
pass
#print('unable to access cross reference database data')
list_entry = [str(uniprotID), str(subcell_ID), str(subcell_key), str(topology_ID),str(topology_key),str(interpro_ID), str(interpro_value), str(pfam_ID), str(pfam_value), str(reactome_ID), str(reactome_value)]
cols = ['uniprotID', 'subcellular_ID', 'subcellular_Key', 'toplology_ID', 'topology_Key', 'InterPro_ID', 'InterPro_Key', 'Pfam_ID', 'Pfam_Key', 'Reactome_ID', 'Reactome_Key']
'''
if len(subcell_ID) != len(subcell_key) or len(interpro_ID) != len(interpro_value) or len(pfam_ID) != len(pfam_value) or len(reactome_ID) != len(reactome_value) or len(topology_ID) != len(topology_key):
raise Exception(f"inconsistent length of ID and key. uniprotID: {uniprotID} ")
#print('inconsistent length of ID and key')
'''
#assert len(subcell_ID) == len(subcell_key), f"{uniprotID} has inconsistent subcell length"
assert len(topology_ID) == len(topology_key), f"{uniprotID} has inconsistent topology length, ID:{topology_ID}, key:{topology_key}"
for i in range(len(list_entry)):
if list_entry[i] == "[]":
list_entry[i] = ""
df_entry = pd.DataFrame([list_entry], columns=cols)
return df_entry
def MemType(self, df):
'''
Subfunction of QueryUniProt()
Generate a dataframe with membrane type classification results without the original subloc details
input:
- df: the dataframe of results from uniprot
output:
- df_class: dataframe with membrane types
'''
#define membrane type mapping (SL-ID from: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/subcell.txt)
peripheral_IDs = ['SL-9903']
lipidanchor_IDs = ['SL-9901', 'SL-9902', 'SL-9920']
transmembrane_IDs = ['SL-9909', 'SL-9904', 'SL-9905', 'SL-9906', 'SL-9907', 'SL-9908']
soluble = ['SL-0170', 'SL-0215', 'SL-0309', 'SL-0086', 'SL-0188', 'SL-0190', 'SL-0243', 'SL-0096', 'SL-0156', 'SL-0133', 'SL-0202', 'SL-0200']
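# Topology IDs determine Peripheral/Lipid-anchor/Transmembrane; subcellular-location IDs determine Soluble.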
df_class = df.copy()
def classify(subcellular_ID, toplology_ID):
classes = []
#if not pd.isna(toplology_ID):
if not toplology_ID == "":
for ID in literal_eval(toplology_ID):
if ID in peripheral_IDs:
classes.append("Peripheral")
if ID in lipidanchor_IDs:
classes.append("Lipid-anchor")
if ID in transmembrane_IDs:
classes.append("Transmembrane")
if not subcellular_ID == "":
for ID in literal_eval(subcellular_ID):
if ID in soluble:
classes.append("Soluble")
if len(classes) == 0:
return ""
else:
return list(set(classes))
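# Illustrative example: classify("['SL-0086']", "['SL-9903']") yields ['Peripheral', 'Soluble']
# (element order may vary, since duplicates are removed through a set).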
df_class['MembraneType_UniProt'] = df_class.apply(lambda x: classify(x.subcellular_ID, x.toplology_ID), axis=1)
df_class.drop(columns=['subcellular_ID', 'subcellular_Key', 'toplology_ID', 'topology_Key'], inplace=True)
return df_class
def MapUni(self, full_dataframe, protein_uniprot_dataframe):
'''
Map the uniprot information to the full dataframe, and update the mapped dataframe as self.full_df
input:
- full_dataframe: full dataframe
- protein_uniprot_dataframe: protein uniprot annotation file
'''
full_dataframe['uniprot_single'] = full_dataframe['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
protein_uniprot_dataframe = protein_uniprot_dataframe.rename(columns={'uniprotID':'uniprot_single'}) #change the column name to map onto full_dataframe
full_dataframe = full_dataframe.merge(protein_uniprot_dataframe, how='left', on='uniprot_single')
full_dataframe.drop(columns=['uniprot_single'], inplace=True)
#update self.full_df
self.full_df = full_dataframe
def GetMemType_DeepLoc(self):
'''
(1) Save a pre-deeploc annotation file (matching: seqID, sequences, uniprotIDs) that needs deepLoc to classify
save this file as unclass_protein.csv (seqID as headers in the fasta file)
# For entries with uniprotIDs, remove entries with repeating uniprotIDs
# For entries without uniprotIDs, remove entries with repeating sequences
(2) Get the fasta files for deeploc submission
'''
directory = 'data/DeepLoc'
os.makedirs(directory, exist_ok = True)
# Get a subset of full_df (unclassified_df) of entries that don't already have membrane classifications:
self.full_df['MembraneType_UniProt'] = self.full_df['MembraneType_UniProt'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
unclassified_df = self.full_df.loc[self.full_df['MembraneType_UniProt'].isna()]
unclassified_df = unclassified_df[['protein_UniProt_ID', 'protein_Sequence']].copy() #work on a copy so the column assignments below don't warn on a slice of full_df
# TODO: Delete the already classified entries (if there is existing deepLoc annotation file)
# Process uniprotID and drop the duplicates for uniprotID
unclassified_df['protein_UniProt_ID'] = unclassified_df['protein_UniProt_ID'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
unclassified_df['protein_UniProt_ID'] = unclassified_df['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
unclassified_df = unclassified_df[(~unclassified_df['protein_UniProt_ID'].duplicated()) | (unclassified_df['protein_UniProt_ID'].isna())] #drop uniprot duplicates but don't drop nan
# format the sequence to remove "()"
unclassified_df['protein_Sequence'] = unclassified_df['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
#keep a dictionary to map unique uniprotID to seqID
dict_Uni2SeqID = {}
for uniprotID in unclassified_df['protein_UniProt_ID'].to_list():
if not pd.isnull(uniprotID):
ID = uuid.uuid4()
dict_Uni2SeqID[uniprotID] = ID
#keep a dictionary to map unique sequences (without uniprotIDs) to seqID
seq_withoutUniID = list(set(unclassified_df.loc[unclassified_df['protein_UniProt_ID'].isna(), 'protein_Sequence'].to_list()))
dict_Seq2SeqID = {}
for seq in seq_withoutUniID:
ID = uuid.uuid4()
dict_Seq2SeqID[seq] = ID
# Generate the column seqID
def getSeqID(uniprot, sequence):
if (isinstance(uniprot, str)):
if uniprot != "":
seqID = dict_Uni2SeqID[uniprot]
return seqID
seqID = dict_Seq2SeqID[sequence]
return seqID
unclassified_df['seqID'] = unclassified_df.apply(lambda x: getSeqID(x.protein_UniProt_ID, x.protein_Sequence), axis=1)
unclassified_df.drop_duplicates(subset=['seqID'], inplace=True) #eliminate the duplicate sequences from the entries without uniprot IDs
unclassified_df.to_csv("data/DeepLoc/unclass_protein.csv") #save this file
# Format them into a FASTA format and save it
seqs = unclassified_df['protein_Sequence'].tolist()
seqIDs = unclassified_df['seqID'].tolist()
#save multiple fasta files with up to 500 sequences each (the deeploc webserver accepts at most 500 sequences per file)
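# e.g. 1,203 sequences (a made-up count) would yield deeplocInput_1.fasta and deeplocInput_2.fasta
# with 500 sequences each, plus deeplocInput_3.fasta with the remaining 203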
i=0
for x in range(0, len(seqs), 500):
i+=1
sub_seqIDs, sub_seqs = seqIDs[x:x+500], seqs[x:x+500]
with open(f"data/DeepLoc/deeplocInput_{i}.fasta", "w") as f:
for id, seq in zip(sub_seqIDs, sub_seqs):
f.write(">" + str(id) + "\n" + seq + "\n")
print('FASTA files deeplocInput_<i>.fasta produced in data/DeepLoc. Upload the files to https://services.healthtech.dtu.dk/services/DeepLoc-2.1/ and select the slow model and short output.')
print('Please place all the result csv files in data/DeepLoc')
def GetMemType_DeepLoc_Old(self):
'''
(Legacy version superseded by GetMemType_DeepLoc; note that it still references self.protein_df_current, which no longer exists on this class.)
Get the fasta file of all proteins without classifications and save unclass_protein.csv, which maps uniprot IDs, sequence --> sequenceID (used as headers in the fasta file)
'''
directory = 'data/DeepLoc'
os.makedirs(directory, exist_ok = True)
# Get a list of UniProt IDs that already have the membrane classifications:
protein_df = pd.read_csv("data/protein_uniprotMap.txt", sep='\t')
protein_df['MembraneType_UniProt'] = protein_df['MembraneType_UniProt'].apply(lambda x: np.nan if isinstance(x, str) and (not x) else x) #replace empty strings with np.nan
UniProt_classified = (protein_df.loc[~protein_df['MembraneType_UniProt'].isna(), 'uniprotID']).to_list()
# Get a list of UniProt Ids are in the existing protein annotation file
if self.protein_df_current is not None:
UniProt_Ignore = self.protein_df_current['protein_UniProt_ID'].to_list()
else:
UniProt_Ignore = []
# Get a dataframe of unique protein sequences that need to be classified for membrane classifications:
biodolphin_df_unclassified = self.full_df.loc[~self.full_df['protein_UniProt_ID'].isin(UniProt_classified), ['protein_UniProt_ID', 'protein_Sequence']] #not classified by uniprot already
biodolphin_df_unclassified = biodolphin_df_unclassified.loc[~biodolphin_df_unclassified['protein_UniProt_ID'].isin(UniProt_Ignore)] # not in the protein annotation file already
# format the sequence to remove "()"
biodolphin_df_unclassified['protein_Sequence'] = biodolphin_df_unclassified['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
# drop duplicates of protein sequence
biodolphin_df_unclassified.drop_duplicates(subset=['protein_Sequence'],inplace=True)
# drop duplicates of uniprot ID (but keep ones without uniprot IDs)
biodolphin_df_unclassified = biodolphin_df_unclassified[(~biodolphin_df_unclassified['protein_UniProt_ID'].duplicated()) | (biodolphin_df_unclassified['protein_UniProt_ID'].isna())]
# assign sequence ID for matching
seqIDs = [uuid.uuid4() for x in range(biodolphin_df_unclassified.shape[0])]
biodolphin_df_unclassified['sequenceID'] = seqIDs
biodolphin_df_unclassified.to_csv("data/DeepLoc/unclass_protein.csv")
# Format them into a FASTA format and save it
seqs = biodolphin_df_unclassified['protein_Sequence'].tolist()
#print(f"number of protien sequences:{len(seqs)}")
#save multiple fasta files with 500 sequences(max num of sequence is 500 on deeploc webserver)
i=0
for x in range(0, len(seqs), 500):
i+=1
sub_seqIDs, sub_seqs = seqIDs[x:x+500], seqs[x:x+500]
with open(f"data/DeepLoc/deeplocInput_{i}.fasta", "w") as f:
for id, seq in zip(sub_seqIDs, sub_seqs):
f.write(">" + str(id) + "\n" + seq + "\n")
print('FASTA files deeplocInput_<i>.fasta produced in data/DeepLoc. Upload the files to https://services.healthtech.dtu.dk/services/DeepLoc-2.1/ and select the slow model and short output.')
print('Please place all the result csv files in data/DeepLoc')
def CombineDeepLoc(self):
'''
Read the results files from DeepLoc and incorporate the classifications
Result: update self.protein_df_deep as the protein annotation file with deeploc information (seqID, uniprotID, sequence, MemClass-deeploc)
'''
# results of deeploc: get df_ID2Mem to map seqID to MembraneType_DeepLoc
fn_results = [filename for filename in os.listdir('./data/DeepLoc') if filename.startswith("results_")]
dfs_results = [pd.read_csv('./data/DeepLoc/'+fn) for fn in fn_results]
df_ID2Mem = pd.concat(dfs_results)[["Protein_ID", "Membrane types"]].rename(columns={"Protein_ID": "seqID", "Membrane types":"MembraneType_DeepLoc"})
df_ID2Mem['MembraneType_DeepLoc'] = df_ID2Mem['MembraneType_DeepLoc'].apply(lambda x: x.split("|"))
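# DeepLoc's short output lists membrane types as a pipe-separated string, so a value such as
# "Transmembrane|Lipid anchor" (illustrative) becomes the list ['Transmembrane', 'Lipid anchor']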
# read the unclass_protein.csv (protein_UniProt_ID,protein_Sequence,seqID) and map MembraneType_DeepLoc onto it
unclass_df = pd.read_csv('./data/DeepLoc/unclass_protein.csv',index_col=False)
predict_df = unclass_df.merge(df_ID2Mem,how='left', on='seqID') #predict_df: (protein_UniProt_ID, protein_Sequence, seqID, MembraneType_DeepLoc)
self.protein_df_deep = predict_df
#create entries for proteins with more than one uniprot ID
#predict_df['protein_UniProt_ID'] = predict_df['protein_UniProt_ID'].str.split(',')
#predict_df = predict_df.explode('protein_UniProt_ID')
#predict_df = predict_df[(~predict_df['protein_UniProt_ID'].duplicated()) | (predict_df['protein_UniProt_ID'].isna())] #drop duplicates
# combine the prediction file and the uniprot annotated file as a final protein annotation file:
# map the ones with uniprot IDs:
#uniprot_df = pd.read_csv('data/protein_uniprotMap.txt', sep='\t')
#uniprot_df = uniprot_df.rename(columns={'uniprotID':'protein_UniProt_ID'})
#combined_df = uniprot_df.merge(predict_df[["protein_UniProt_ID","MembraneType_DeepLoc"]], how='left', on='protein_UniProt_ID')
#combined_df['withUniProtID'] = True
#combined_df['protein_Sequence'] = ""
# map the ones without uniprot IDs and add them to the dataframe too:
#predict_df_sequence = predict_df[predict_df['protein_UniProt_ID'].isna()][["protein_Sequence","MembraneType_DeepLoc"]]
#predict_df_sequence['withUniProtID'] = False
#combined_df = pd.concat([combined_df, predict_df_sequence], ignore_index=True, sort=False)
# rename columns with protein prefix:
#combined_df = combined_df.rename(columns={"InterPro_ID": "protein_InterPro_ID", "InterPro_Key":"protein_InterPro_Key", "Pfam_ID":"protein_Pfam_ID", "Pfam_Key":"protein_Pfam_Key", "Reactome_ID":"protein_Reactome_ID", "Reactome_Key":"protein_Reactome_Key", "MembraneType_UniProt":"protein_MembraneType_UniProt", "MembraneType_DeepLoc":"protein_MembraneType_DeepLoc"})
#combined_df = combined_df.loc[:, ~combined_df.columns.str.contains('^Unnamed')] #remove extra index columns
# place the combined dataframe to self.protein_df:
#self.protein_df = combined_df
print('Finished combining protein annotation dataframes from deeploc and uniprot')
def MapDeep(self, full_dataframe, protein_deep_dataframe):
'''
Map the deeploc information onto the full dataframe, and update the mapped dataframe as self.full_df
input:
- full_dataframe: full dataframe
- protein_deep_dataframe: protein deeploc annotation file
'''
# map the deepLoc prediction from uniprotID (for the entries with uniprotID)
full_dataframe['uniprot_single'] = full_dataframe['protein_UniProt_ID'].map(lambda x: x.split(',')[0] if (isinstance(x, str) and (',' in x)) else x) #get the first uniprot ID for mapping
protein_df_uniprot = protein_deep_dataframe.loc[~protein_deep_dataframe['protein_UniProt_ID'].isna()] # subset of protein annotation file where the protein has UniProt IDs
protein_df_uniprot = protein_df_uniprot[['protein_UniProt_ID', 'MembraneType_DeepLoc']].copy() #get the needed columns (copy so the rename below doesn't warn on a slice)
protein_df_uniprot.rename(columns={'protein_UniProt_ID':'uniprot_single'}, inplace=True)
full_dataframe = full_dataframe.merge(protein_df_uniprot, how='left', on='uniprot_single')
full_dataframe.drop(columns=['uniprot_single'], inplace=True)
# map the deepLoc prediction from sequence (for the entries without uniprotID)
full_dataframe['protein_Sequence_format'] = full_dataframe['protein_Sequence'].map(lambda x: x.replace('(','').replace(')',''))
protein_df_sequence = protein_deep_dataframe.loc[protein_deep_dataframe['protein_UniProt_ID'].isna()].copy() # subset of protein annotation file where the proteins don't have UniProt IDs (need to be matched by protein sequences)
protein_df_sequence.rename(columns={'protein_Sequence':'protein_Sequence_format'}, inplace=True)
protein_df_sequence = protein_df_sequence[['protein_Sequence_format', 'MembraneType_DeepLoc']]
full_dataframe = full_dataframe.merge(protein_df_sequence, how='left', on='protein_Sequence_format')
# process the repeating protein_MembraneType_DeepLoc from the two methods above
def SelectDeepLoc(x, y):
if isinstance(x, list):
return x
else:
return y
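# MembraneType_DeepLoc_x comes from the uniprot-ID merge and MembraneType_DeepLoc_y from the
# sequence merge; the uniprot-based match is preferred whenever it produced a list.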
full_dataframe['protein_MembraneType_DeepLoc'] = full_dataframe.apply(lambda x: SelectDeepLoc(x.MembraneType_DeepLoc_x, x.MembraneType_DeepLoc_y), axis=1)
full_dataframe.drop(columns=['protein_Sequence_format'] , inplace=True)
full_dataframe.drop(columns=['MembraneType_DeepLoc_x', 'MembraneType_DeepLoc_y'] , inplace=True)
#update self.full_df
self.full_df = full_dataframe
def Format(self):
'''
Format the final dataframe
Result:
Place the formatted full dataframe to self.full_df
'''
df = self.full_df
# remove columns
df = df.drop(columns=['New_query'])
# rename columns
df = df.rename(columns={"InterPro_ID": "protein_InterPro_ID", "InterPro_Key": "protein_InterPro_Key", "Pfam_ID":"protein_Pfam_ID", "Pfam_Key":"protein_Pfam_Key", "Reactome_ID":"protein_Reactome_ID", "Reactome_Key":"protein_Reactome_Key", "MembraneType_UniProt":"protein_MembraneType_UniProt"})
# adjust column orders
df = df[['BioDolphinID', 'complex_PDB_ID', 'complex_Resolution', 'complex_Receptor_Chain', \
'complex_Receptor_asymChain', 'complex_Ligand_Chain', \
'complex_Ligand_Serial_Number', 'complex_Residue_number_of_the_ligand', \
'complex_Binding_Site_Number_Code', \
'complex_Binding_Site_Residues_(PDB)', \
'complex_Binding_Site_Residues_(Re-numbered)', \
'complex_Catalytic_Site_Residues_(PDB)', \
'complex_Catalytic_Site_Residues_(Re-numbered)', \
'complex_Binding_Affinity_(Manual_Survey)', \
'complex_Binding_Affinity_(Binding_MOAD_database)', \
'complex_Binding_Affinity_(PDBbind-CN_database)', \
'complex_Binding_Affinity_(BindingDB_database)', 'complex_PubMed_ID', \
'protein_UniProt_ID', 'protein_Name', 'protein_Synonyms', \
'protein_Sequence', 'protein_EC_number', 'protein_Organism', \
'protein_Biological_Process_(GO)', 'protein_Cellular_Component_(GO)', \
'protein_Molecular_Function_(GO)', 'entry_source', 'protein_InterPro_ID', 'protein_InterPro_Key', \
'protein_Pfam_ID', 'protein_Pfam_Key', 'protein_Reactome_ID', 'protein_Reactome_Key', \
'protein_MembraneType_UniProt', 'protein_MembraneType_DeepLoc', \
'lipid_Ligand_ID_CCD', 'lipid_PDB_Name', 'lipid_Synonyms', \
'lipid_InChI', 'lipid_InChIKey', 'lipid_Canonical_smiles', \
'lipid_Isomeric_smiles', 'lipid_IUPAC_name', 'lipid_Cid', \
'lipid_drugbank', 'lipid_ChEBI', 'lipid_ChEMBL', \
'lipid_Molecular_formula', 'lipid_Molecular_weight', \
'lipid_Lipidmaps_categories', 'lipid_Lipidmaps_terms', \
'lipid_class_source', 'lipid_pharmacological_class', 'lipid_adverse', \
'lipid_disease']]
self.full_df = df
'''
def Update(self):
Combine old protein annotation file with new protein annotation file
Result:
Save the updated annotation file at self.protein_df
new_protein_df = self.protein_df
old_protein_df = self.protein_df_current
if old_protein_df is not None:
assert new_protein_df.columns == old_protein_df.columns
self.protein_df = pd.concat([old_protein_df, new_protein_df])
else:
self.protein_df = new_protein_df
'''
def SaveProtAnno(self, df, prefix):
'''
save the given protein annotation dataframe to data/<prefix>.txt and data/<prefix>.csv
input:
- df: protein annotation dataframe to save
- prefix: prefix of the filename to save as
results:
- data/<prefix>.txt: mapping of protein information in tab-separated txt format
- data/<prefix>.csv: mapping of protein information in csv format
'''
protein_filepath = "data/"+prefix
protein_filepath_txt, protein_filepath_csv = protein_filepath + ".txt", protein_filepath + ".csv"
print(f'saving the protein mapping dataframe to: {protein_filepath_txt} and {protein_filepath_csv}')
df.to_csv(protein_filepath_txt, index=False,sep='\t')
df.to_csv(protein_filepath_csv, index=False)
if __name__ == "__main__":
full_df = pd.read_csv('data/BioDolphin_vr1.0.txt', sep='\t')
full_df["New_query"] = True
protannotator = ProteinAnnotation(full_df)
#protannotator.protein_df = pd.read_csv("data/protein_uniprotMap.txt", sep='\t')
#protannotator.GetMemType_DeepLoc()
df = protannotator.Run(mode='queryUniprot', date='test')
df.to_csv('test_mapuni.csv')
print('finished')
'''
# to get protein annotations from uniprot:
t_start = time.time()
print('defining protein annotation class:')
protannotator = ProteinAnnotation('BioDolphin_vr1.0.txt')
print('getting ID list')
protannotator.GetIDList(save=True)
print('querying from uniprot')
protannotator.QueryUniProt()
#print('saving the files')
protannotator.SaveProtAnno()
#print(len(protannotator.uniprotIDs))
t_end = time.time()
print(f'run time: {t_end - t_start} sec')
'''