Update CMPB #8

draeger-lab · Aug 19, 2024 · b0677ac · b0677ac
1 parent 1dd792f
commit b0677ac
Show file tree

Hide file tree

Showing 3 changed files with 152 additions and 90 deletions.
diff --git a/src/specimen/cmpb/workflow.py b/src/specimen/cmpb/workflow.py
@@ -22,10 +22,11 @@
 from refinegems.analysis import growth
 from refinegems.analysis.investigate import plot_rea_sbo_single
 from refinegems.classes.reports import ModelInfoReport
-from refinegems.curation.biomass import test_biomass_presence, check_normalise_biomass
+from refinegems.utility.entities import test_biomass_presence
+from refinegems.curation.biomass import check_normalise_biomass
 from refinegems.curation.charges import correct_charges_modelseed
 from refinegems.curation.curate import resolve_duplicates
-from refinegems.curation.gapfill import gapfill_model, gapfill
+from refinegems.classes.gapfill import KEGGapFiller, BioCycGapFiller, GeneGapFiller
 from refinegems.curation.pathways import kegg_pathways, kegg_pathway_analysis
 from refinegems.curation.polish import polish
 from refinegems.utility.connections import run_memote, perform_mcc, adjust_BOF, run_SBOannotator
@@ -132,6 +133,7 @@ def between_analysis(model: Model, cfg:dict, step:str):
     Path(dir,"cmpb_out",'logs').mkdir(parents=True, exist_ok=False)                   #   |- logs
     Path(dir,"cmpb_out",'misc').mkdir(parents=True, exist_ok=False)                   #   |- misc
     Path(dir,"cmpb_out",'misc', 'memote').mkdir(parents=True, exist_ok=False)         #      |- memote
+    Path(dir,"cmpb_out",'misc', 'gapfill').mkdir(parents=True, exist_ok=False)        #      |- gapfill
     Path(dir,"cmpb_out",'misc', 'growth').mkdir(parents=True, exist_ok=False)         #      |- growth
     Path(dir,"cmpb_out",'misc', 'stats').mkdir(parents=True, exist_ok=False)          #      |- stats
     Path(dir,"cmpb_out",'misc', 'kegg_pathway').mkdir(parents=True, exist_ok=False)   #      |- kegg_pathways
@@ -146,9 +148,9 @@ def between_analysis(model: Model, cfg:dict, step:str):
     #########
     if not config['input']['modelpath']:
         if config['carveme']['gram'] == "grampos" or config['carveme']['gram'] == "gramneg":
-            subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"])
+            subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"])
         else: 
-            subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"])
+            subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"])
         config['input']['modelpath'] = dir+'\cmpb_out\models\Draft.xml'
     current_modelpath = config['input']['modelpath']
 
@@ -159,10 +161,10 @@ def between_analysis(model: Model, cfg:dict, step:str):
     if 'CarveMe' in current_libmodel.getAnnotationString():
         Path(dir,"cmpb_out",'misc', 'wrong_annotations').mkdir(parents=True, exist_ok=False)
         current_libmodel = polish(current_libmodel, 
-                                  email = config['cm-polish']['email'],
+                                  email = config['tech-resources']['email'],
                                   id_db = config['general']['namespace'],
                                   refseq_gff = config['general']['refseq_gff'],
-                                  protein_fasta = config['cm-polish']['protein_fasta'],
+                                  protein_fasta = config['general']['protein_fasta'],
                                   lab_strain = config['cm-polish']['is_lab_strain'],
                                   kegg_organism_id = config['general']['kegg_organism_id'],
                                   path = Path(dir,'cmpb_out','misc','wrong_annotations'))
@@ -188,68 +190,101 @@ def between_analysis(model: Model, cfg:dict, step:str):
 
     # gapfilling
     ############
-    # options: automatic/manual extension/manual input
-    if config['gapfilling']['gap_fill_params']: 
-
-        filename = str(Path(dir, 'cmpb_out', 'misc',f'{current_libmodel.getId()}_gap_analysis_{str(today)}'))
-        gapfilling_perfomed = True
-
-        match config['gapfilling']['gap_fill_params']['db_to_compare']:
-
-            case 'KEGG':
-                gap_analysis_result, current_libmodel = gapfill(
-                    current_libmodel, 'KEGG', config['general']['refseq_gff'], 
-                    config['general']['kegg_organism_id'], None, 
-                    filename
-                    )
-            case 'BioCyc':
-                gap_analysis_result, current_libmodel = gapfill(
-                    current_libmodel, 'BioCyc', None, None, config['gapfilling']['gap_fill_params']['biocyc_files'], 
-                    filename
-                    )
-            case 'KEGG+BioCyc':
-                gap_analysis_result, current_libmodel = gapfill(
-                    current_libmodel, 'KEGG+BioCyc', config['general']['refseq_gff'],
-                    config['general']['kegg_organism_id'], config['gapfilling']['gap_fill_params']['biocyc_files'], 
-                    filename
-                    )
-            case 'USER' | None:
-                logging.info(f'Gapfilling skipped.')
-                gapfilling_perfomed = False
-            case _:
-                logging.warning(
-                    f'''
-                    \'{config["gapfilling"]["gap_fill_params"]["db_to_compare"]}\' is no valid input for the gap analysis.
-                    Please specify for the parameter \'db_to_compare\' one of the following options: 
-                    KEGG | BioCyc | KEGG+BioCyc
-                    '''
-                    )
-
-        if gapfilling_perfomed:
-            logging.info(f'Gap analysis for {current_libmodel.getId()} with {config["gapfilling"]["gap_fill_params"]["db_to_compare"]} was performed.')
-            logging.info(f'Complete Excel table is in file: {filename}.xlsx.')
-
+    run_gapfill = False
+    # KEGGapFiller
+    if config['gapfilling']['KEGGapFiller']:
+        if config['general']['kegg_organism_id']:
+            run_gapfill = True
+            # gapfilling with KEGG via KEGG organism ID
+            kgf = KEGGapFiller(config['general']['kegg_organism_id'])
+            kgf.find_missing_genes(current_libmodel)
+            kgf.find_missing_reacs(current_model)
+            current_libmodel = kgf.fill_model(current_libmodel, 
+                                              namespace = config['general']['namespace'],
+                                              idprefix = config['gapfilling']['idprefix'],
+                                              formula_check = config['gapfilling']['formula-check'],
+                                              exclude_dna = config['gapfilling']['formula-check'],
+                                              exclude_rna = config['gapfilling']['exclude-rna']
+                                              )
             # save model
             if config['general']['save_all_models']:
-                write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml')))
-                current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml')
+                write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))     
+                current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
             else:
-                write_model_to_file(current_libmodel, str(current_modelpath))
-
-            logging.info(f'Gaps were filled automatically in {current_libmodel.getId()}.')
-
-    if config['gapfilling']['gap_fill_file']: 
-        current_libmodel = gapfill_model(current_libmodel, config['gapfilling']['gap_fill_file'])
-
+                write_model_to_file(current_libmodel, str(only_modelpath))
+                current_modelpath = only_modelpath
+            current_model = load_model(current_modelpath,'cobra')
+        else:
+            mes = f'No KEGG organism ID provided. Gapfilling with KEGG will be skipped.'
+            raise warnings.warn(mes,UserWarning)
+
+    # BioCycGapFiller
+    if config['gapfilling']['BioCycGapFiller']:
+        run_gapfill = True
+        bcgf = BioCycGapFiller(config['gapfilling']['BioCycGapFiller parameters']['gene-table'],
+                               config['gapfilling']['BioCycGapFiller parameters']['reacs-table'],
+                               config['gapfilling']['BioCycGapFiller parameters']['gff']
+                               )
+        # @TEST if it works
+        bcgf.find_missing_genes(current_libmodel)
+        bcgf.find_missing_reacs(current_model)
+        current_libmodel = kgf.fill_model(current_libmodel, 
+                                          namespace = config['general']['namespace'],
+                                          idprefix = config['gapfilling']['idprefix'],
+                                          formula_check = config['gapfilling']['formula-check'],
+                                          exclude_dna = config['gapfilling']['formula-check'],
+                                          exclude_rna = config['gapfilling']['exclude-rna']
+                                         )
         # save model
         if config['general']['save_all_models']:
-            write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml')))
-            current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml')
+            write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))     
+            current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
         else:
-            write_model_to_file(current_libmodel, str(current_modelpath))
-
-        logging.info(f'Gaps were filled based on a  manually curated file in {current_libmodel.getId()}.')
+            write_model_to_file(current_libmodel, str(only_modelpath))
+            current_modelpath = only_modelpath
+        current_model = load_model(current_modelpath,'cobra')
 
+    # GeneGapFiller
+    if config['gapfilling']['GeneGapFiller']:
+        run_gapfill = True
+        ggf = GeneGapFiller()
+        ggf.find_missing_genes(config['gapfilling']['GeneGapFiller parameters']['gff'],
+                               current_libmodel)
+        ggf.find_missing_reacs(current_model,
+                               mail = config['tech-resources']['email'],
+                               check_NCBI = config['gapfilling']['GeneGapFiller parameters']['check-NCBI'],
+                               fasta = config['general']['protein_fasta'],
+                               dmnd_db = config['gapfilling']['GeneGapFiller parameters']['swissprot-dmnd'],
+                               swissprot_map = config['gapfilling']['GeneGapFiller parameters']['swissprot-mapping'],
+                               outdir = Path(dir,"cmpb_out",'misc', 'gapfill'), # @TODO or would a subfolder be better?
+                               sens = config['gapfilling']['GeneGapFiller parameters']['sensitivity'],
+                               cov = config['gapfilling']['GeneGapFiller parameters']['coverage'],
+                               t = config['tech-resources']['threads'],
+                               pid = config['gapfilling']['GeneGapFiller parameters']['percentage identity'] 
+                               )
+        current_libmodel = ggf.fill_model(current_libmodel, 
+                                          namespace = config['general']['namespace'],
+                                          idprefix = config['gapfilling']['idprefix'],
+                                          formula_check = config['gapfilling']['formula-check'],
+                                          exclude_dna = config['gapfilling']['formula-check'],
+                                          exclude_rna = config['gapfilling']['exclude-rna']
+                                         )
+        # save model
+        if config['general']['save_all_models'] and run_gapfill:
+            write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))     
+            current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
+            current_model = load_model(current_modelpath,'cobra')
+        elif run_gapfill:
+            write_model_to_file(current_libmodel, str(only_modelpath))
+            current_modelpath = only_modelpath
+            current_model = load_model(current_modelpath,'cobra')
+        else: 
+            pass # no gapfill
+
+    # testing
+    if run_gapfill:
+        between_growth_test(current_model,config,step='after_gapfill')
+        between_analysis(current_model, config, step='after_gapfill')
 
     # ModelPolisher
     ###############
@@ -275,11 +310,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
 
     # SBOannotator
     # ------------
-    # @TODO 
-    #  theoretically: something along the way:
-    #libsbml_doc = readSBML(current_modelpath)
-    #libsbml_model = libsbml_doc.getModel()
-    """ current_libmodel = load_model(str(current_modelpath),'libsbml')
+    # @TEST , if it works
+    current_libmodel = load_model(str(current_modelpath),'libsbml')
     if config['general']['save_all_models']:
         current_libmodel = run_SBOannotator(current_libmodel)
         write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models', 'SBOannotated.xml')))
@@ -290,7 +322,7 @@ def between_analysis(model: Model, cfg:dict, step:str):
 
     current_model = load_model(str(current_modelpath),'cobra')
     between_analysis(current_model,config,step='after_annotation')
-     """
+
 
     # model cleanup
     ###############