# main.py

'''
Main script to curate new data and integrate it into BioDolphin

Assumption is:
    under data/, we have: (1) BioDolphin_vr1.0.txt (2) lipid_annotations.txt (3) protein_annotations.txt


Usage:
    (1) when a complete lipid annotation file exists, only new PDB entries need to be queried
    --> python main.py -d BioDolphin_vr1.0.txt -l lipid_annotations.txt -o BioDolphin_vr1.1 --step1
    
    
    (2) when no lipid annotation file exists yet: omit -l, and --step1 generates and saves one from the current dataset


python3 main.py [options]
'''


import argparse
import datetime
import os
import time
import warnings

import pandas as pd

# NOTE: the wildcard imports below may rebind names imported above.
from get_lipidAnnotation import *
from get_proteinAnnotation import *
from get_newEntries import *
from combine import *
from affinity import *
from get_format import *
warnings.filterwarnings("ignore")


def _build_parser():
    '''
    Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser exposing the file options (-d, -l, -p, -m, -o, -r)
        and the three mode flags (--step1, --step2, --report).
    '''
    parser = argparse.ArgumentParser(description='Add arguments')
    parser.add_argument('-d','--dataset', help='current dataset filename (.txt) in the data directory', default="BioDolphin_vr1.0.txt", type=str, required=False)
    parser.add_argument('-l','--lipidfile', help='lipid anotation file filename (.txt) in the data directory', default=None, type=str, required=False)
    parser.add_argument('-p','--proteinfile_uni', help='protein anotation (uniprot) file filename (.txt) in the data directory', default=None, type=str, required=False)
    parser.add_argument('-m','--proteinfile_deep', help='protein anotation (deeploc membrane class prediction) file filename (.txt) in the data directory', default=None, type=str, required=False)
    parser.add_argument('-o','--output', help='output dataset filename prefix in the data directory', type=str, required=False)
    parser.add_argument('-r', '--reportfile', help='report file (.txt) used to define which pdbs to query', default=None, type=str, required=False)
    parser.add_argument('--step1', action=argparse.BooleanOptionalAction)
    parser.add_argument('--step2', action=argparse.BooleanOptionalAction)
    parser.add_argument('--report', action=argparse.BooleanOptionalAction)
    return parser


def _data_path(filename):
    '''
    Prefix a filename with the data directory.

    Returns:
        "data/" + filename, or None when no filename was given.
    '''
    if filename is None:
        return None
    return "data/" + filename


def _run_step1(dataset_current, lipid_annotation, protein_annotation_uni,
               protein_annotation_deep, report_file, output):
    '''
    STEP1: add new PDB entries by automatically searching for PDBs associated with our lipid list.

    Args:
        dataset_current: path of the current dataset (tab-separated .txt).
        lipid_annotation: path of the lipid annotation file, or None to generate
            one from the current dataset and save it.
        protein_annotation_uni: path of the UniProt protein annotation file, or None.
        protein_annotation_deep: path of the DeepLoc protein annotation file, or None.
        report_file: report file name passed through to QueryEntry, or None.
        output: output prefix; results are written to data/STEP1_<output>.txt and .csv.
    '''
    print('--------------STEP1: Begin to update the database!------------------')
    t_start = time.perf_counter()
    # Read the date once so every file name produced by this run agrees,
    # even if the run crosses midnight.
    today = datetime.date.today()

    # Read lipid annotation file.
    print('>>>>>> Reading Lipid Annotation File')
    if lipid_annotation is not None:
        print('reading lipid annotation file')
        anno_lipid = LipidAnnotation(lipid_annotation, source="lipidfile")
        anno_lipid.GetLipid_df()
    else:
        print('generating lipid annotation file and saving it')
        anno_lipid = LipidAnnotation(dataset_current, source="biodolphin")
        anno_lipid.GetLipid_df()
        anno_lipid.SaveLipidAnno(prefix=f"lipid_annotations_{today}")
    lipid_df = anno_lipid.lipid_df
    print('>>>>> Finished Reading Lipid Annotation File')

    # Query new dataset from PDB
    print('>>>>>>> Querying New Dataset From The PDB Database')
    query = QueryEntry(dataset_current, lipid_annotation, report_file)
    query.Run(f"newdata_{today}")
    new_query_df = query.new_dataset_df
    # Combine datasets, add lipid annotations, assign IDs
    all_df = Concat(pd.read_csv(dataset_current, sep='\t'), new_query_df, lipid_df)
    print('>>>>>> Finished Querying New Dataset')

    # Query Extra protein annotations (from UniProt) and Prepare input files for DeepLoc for step2
    print('>>>>> Start Processing Protein Annotation')
    if protein_annotation_uni is not None:
        print('reading existing protein annotation uniprot file')
        # existing protein df (mapping uniprot Id or seq to extra info)
        protein_df_current_uni = pd.read_csv(protein_annotation_uni, sep='\t')
    else:
        print('process without existing protein annotation uniprot file')
        protein_df_current_uni = None

    if protein_annotation_deep is not None:
        print('reading existing protein annotation deeploc file')
        # existing protein df (mapping uniprot Id or seq to extra info)
        protein_df_current_deep = pd.read_csv(protein_annotation_deep, sep='\t')
    else:
        print('process without existing protein annotation deeploc file')
        protein_df_current_deep = None

    anno_protein = ProteinAnnotation(all_df, protein_df_current_uni, protein_df_current_deep)
    # This will map uniprot annotations to all_df and produce deeploc input file
    all_df = anno_protein.Run(mode='queryUniprot', date=f'{today}')
    print('>>>>> Finished Processing Protein Annotation')

    # Save the full dataframe
    all_df.to_csv(f'data/STEP1_{output}.txt',index=False,sep='\t')
    all_df.to_csv(f'data/STEP1_{output}.csv',index=False)

    print(f'--------------STEP1: Finished! File Saved as STEP1_{output}.txt------------------')
    t_end = time.perf_counter()
    print(f'run time for the step: {t_end - t_start} sec')
    print('Follow the instructions above before starting Step 2')


def _run_step2(output):
    '''
    STEP2: add extra protein annotation via DeepLoc and write the final dataset.

    Reads data/STEP1_<output>.txt (written by step 1) and saves the result as
    result/<output>.txt and result/<output>.csv.

    Args:
        output: output prefix; must match the prefix used for step 1.
    '''
    print('------------------STEP2: Begin to add DeepLoc Information!------------------')
    # Read DeepLoc Results
    anno_protein = ProteinAnnotation(full_df=pd.read_csv(f'data/STEP1_{output}.txt',sep='\t'))
    # read deeploc results and map it back to the full df, and output the final dataframe
    df_final = anno_protein.Run(mode='addDeepLoc',date=f'{datetime.date.today()}')

    # Create columns for average affinity
    df_final = AverageAffinity(df_final)

    # Get residue numbers if not already there
    df_final = TagResNumber(df_final)

    # Get LMSD ID with processed lipid annotation file   # TODO:Delete this line in the future!
    # NOTE(review): this path is hard-coded and does not follow -l/--lipidfile - confirm intended.
    df_final = LMSD(df_final, lipidfile='data/lipid_annotations_vr1.1.txt')

    # Format the dataframe
    df_final = Format(df_final)

    # print the final number of entries
    print(f'number of rows in the new dataset: {df_final.shape[0]}')

    # Create Result Directory
    directory = './result'
    os.makedirs(directory, exist_ok = True)

    # save the final dataframe
    df_final.to_csv(f'result/{output}.txt',index=False,sep='\t')
    df_final.to_csv(f'result/{output}.csv',index=False)

    print(f'------------------STEP2: Finished! Files Saved as {output}.txt and {output}.csv in /result------------------')


def _run_report(dataset_current, lipid_annotation):
    '''
    Generate a report for missing entries.

    Args:
        dataset_current: path of the current dataset.
        lipid_annotation: path of the lipid annotation file, or None.
    '''
    print(f'Begin to prepare report for missing PDB entries associated with the lipid list provided in {lipid_annotation}')
    print('Note that PDBs listed in the report may not always be the entries we want to collect (ie.DNA interactions)')

    query = QueryEntry(dataset_current, lipid_annotation)
    query.Report()


def main(argv=None):
    '''
    Parse the command line, echo the resolved settings and run the selected mode.

    Exactly one mode runs per invocation, checked in the order --step1, --step2,
    --report; if none is given a hint is printed and nothing else happens.

    Args:
        argv: list of argument strings; None lets argparse read sys.argv[1:].

    Raises:
        SystemExit: (status 2, via parser.error) when --step1 or --step2 is
            requested without -o/--output.
    '''
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Steps 1 and 2 build their file names from the output prefix; without it
    # they would silently write/read "STEP1_None.txt", so fail fast instead.
    if (args.step1 or args.step2) and args.output is None:
        parser.error('-o/--output is required with --step1 and --step2')

    dataset_current = "data/" + args.dataset
    report_file = args.reportfile
    output = args.output

    print(f'>>>>>>>> current database is at: {dataset_current}')

    if report_file is not None:
        print(f'>>>>>>> using {report_file} as the report to query')

    lipid_annotation = _data_path(args.lipidfile)
    if lipid_annotation is not None:
        print(f'>>>>>>> lipid annotation file is at: {lipid_annotation}')
    else:
        print('>>>>>>> no lipid annotation file is read')

    protein_annotation_uni = _data_path(args.proteinfile_uni)
    if protein_annotation_uni is not None:
        print(f'>>>>>>> protein annotation uniprot file is at: {protein_annotation_uni}')
    else:
        print('>>>>>>> no protein annotation uniprot file is read')

    protein_annotation_deep = _data_path(args.proteinfile_deep)
    if protein_annotation_deep is not None:
        print(f'>>>>>>> protein annotation deeploc file is at: {protein_annotation_deep}')
    else:
        print('>>>>>> no protein annotation deeploc file is read')

    print(f'>>>>>>>>> output prefix is set as: {output}')

    if args.step1:
        _run_step1(dataset_current, lipid_annotation, protein_annotation_uni,
                   protein_annotation_deep, report_file, output)
    elif args.step2:
        _run_step2(output)
    elif args.report:
        _run_report(dataset_current, lipid_annotation)
    else:
        print('please specify flags for either --step1 or --step2 or --report')


if __name__ == "__main__":
    main()