Integrate ModelPolisher in CMPB #8

draeger-lab · Nov 29, 2024 · 8f0efab · 8f0efab
1 parent 225d207
commit 8f0efab
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 11 deletions.
diff --git a/src/specimen/cmpb/workflow.py b/src/specimen/cmpb/workflow.py
@@ -11,6 +11,7 @@
 import logging
 import pandas as pd
 from datetime import date
+import model_polisher as mp
 from pathlib import Path
 from typing import Union
 
@@ -140,6 +141,7 @@ def between_analysis(model: Model, cfg:dict, step:str):
     Path(dir,"cmpb_out",'misc', 'gapfill').mkdir(parents=True, exist_ok=False)        #      |- gapfill
     Path(dir,"cmpb_out",'misc', 'growth').mkdir(parents=True, exist_ok=False)         #      |- growth
     Path(dir,"cmpb_out",'misc', 'stats').mkdir(parents=True, exist_ok=False)          #      |- stats
+    Path(dir,"cmpb_out",'misc', 'modelpolisher').mkdir(parents=True, exist_ok=False)  #      |- modelpolisher
     Path(dir,"cmpb_out",'misc', 'kegg_pathway').mkdir(parents=True, exist_ok=False)   #      |- kegg_pathway
     Path(dir,"cmpb_out",'misc', 'auxotrophy').mkdir(parents=True, exist_ok=False)     #      |- auxotrophy
 
@@ -216,7 +218,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
                 current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
             else:
                 write_model_to_file(current_libmodel, str(only_modelpath))
-            current_model = load_model(current_modelpath,'cobra')
+                current_modelpath = only_modelpath
+            current_model = load_model(str(current_modelpath),'cobra')
         else:
             mes = f'No KEGG organism ID provided. Gapfilling with KEGG will be skipped.'
             raise warnings.warn(mes,UserWarning)
@@ -244,7 +247,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
             current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_BioCyc_gapfill.xml')
         else:
             write_model_to_file(current_libmodel, str(only_modelpath))
-        current_model = load_model(current_modelpath,'cobra')
+            current_modelpath = only_modelpath
+        current_model = load_model(str(current_modelpath),'cobra')
 
     # GeneGapFiller
     if config['gapfilling']['GeneGapFiller']:
@@ -275,10 +279,11 @@ def between_analysis(model: Model, cfg:dict, step:str):
         if config['general']['save_all_models']:
             write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_Gene_gapfill.xml')))     
             current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_Gene_gapfill.xml')
-            current_model = load_model(current_modelpath,'cobra')
         else:
             write_model_to_file(current_libmodel, str(only_modelpath))
-            current_model = load_model(current_modelpath,'cobra')
+            current_modelpath = only_modelpath
+
+    current_model = load_model(str(current_modelpath),'cobra')
 
     # testing
     if run_gapfill:
@@ -287,10 +292,32 @@ def between_analysis(model: Model, cfg:dict, step:str):
 
     # ModelPolisher
     ###############
-    # @TODO
-    # future update
-    # currently being revamped 
-    # and python access is coming soon
+    if config['modelpolisher']:
+        config_mp = {"allow-model-to-be-saved-on-server": config["mp"]["allow-model-to-be-saved-on-server"], 
+                         "fixing": {"dont-fix": config["mp"]["fixing"]["dont-fix"]},
+                         "annotation": {"bigg": {"annotate-with-bigg": config["mp"]["annotation"]["bigg"]["annotate-with-bigg"], 
+                                                 "include-any-uri": config["mp"]["annotation"]["bigg"]["include-any-uri"]}}}
+
+        result = mp.polish_model_file(current_modelpath, config_mp)
+
+        # @DISCUSSION Should the run ID (result['run_id']) be saved somewhere for debugging purposes?
+        pd.DataFrame(result['diff']).to_csv(Path(dir,'cmpb_out','misc','modelpolisher','diff_mp.csv'), sep=';', header=False)
+        pd.DataFrame(result['pre_validation']).to_csv(Path(dir,'cmpb_out','misc','modelpolisher','pre_validation.csv'), sep=';', header=True)
+        pd.DataFrame(result['post_validation']).to_csv(Path(dir,'cmpb_out','misc','modelpolisher','post_validation.csv'), sep=';', header=True)
+
+        # save model
+        if config['general']['save_all_models']:
+            write_model_to_file(result["polished_document"].getModel(), str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_ModelPolisher.xml')))     
+            current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_ModelPolisher.xml')
+        else:
+            write_model_to_file(result["polished_document"].getModel(), str(only_modelpath))
+            current_modelpath = only_modelpath
+
+        current_model = load_model(str(current_modelpath),'cobra')
+
+        # in-between testing
+        between_growth_test(current_model,config,step='after_ModelPolisher')
+        between_analysis(current_model, config, step='after_ModelPolisher')
 
     # Annotations
     #############
@@ -307,7 +334,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
             write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_added_KeggPathwayGroups.xml')))
             current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_added_KeggPathwayGroups.xml')
         else:
-            write_model_to_file(current_libmodel, str(current_modelpath))
+            write_model_to_file(current_libmodel, str(only_modelpath))
+            current_modelpath = only_modelpath
 
     # SBOannotator
     # ------------
@@ -319,7 +347,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
         current_modelpath = Path(dir,'cmpb_out','models', f'{current_libmodel.getId()}_SBOannotated.xml')
     else:
         current_libmodel = run_SBOannotator(current_libmodel)
-        write_model_to_file(current_libmodel, str(current_modelpath))
+        write_model_to_file(current_libmodel, str(only_modelpath))
+        current_modelpath = only_modelpath
 
     current_model = load_model(str(current_modelpath),'cobra')
     between_analysis(current_model,config,step='after_annotation')
@@ -413,7 +442,8 @@ def between_analysis(model: Model, cfg:dict, step:str):
         write_model_to_file(current_model, str(Path(dir,'cmpb_out','models',f'{current_model.id}_after_BOF.xml')))
         current_modelpath = Path(dir,'cmpb_out','models',f'{current_model.id}_after_BOF.xml')
     else:
-        write_model_to_file(current_model, str(current_modelpath))
+        write_model_to_file(current_model, str(only_modelpath))
+        current_modelpath = only_modelpath
 
     # MCC
     # ---

diff --git a/src/specimen/data/config/cmpb_config.yaml b/src/specimen/data/config/cmpb_config.yaml
@@ -105,6 +105,17 @@ gapfilling:
     percentage identity: 90.0     # Percentage identity threshold value for accepting
                                   # matches found by DIAMOND as homologous.
 
+# Polish the model using ModelPolisher
+# ------------------------------------
+modelpolisher: True                         # Activate ModelPolisher
+mp:
+  allow-model-to-be-saved-on-server: False  # Enable saving the model on the server
+  fixing:
+    dont-fix: False                         # If True, skip fixing; if False, unset mandatory default values are set
+  annotation:
+    bigg:
+      annotate-with-bigg: True              # Enable annotation with BiGG
+      include-any-uri: True                 # Enable adding annotations that are not MIRIAM-compliant
 
 # Add KEGG pathways as groups, optional
 # -------------------------------------

diff --git a/src/specimen/util/set_up.py b/src/specimen/util/set_up.py
@@ -578,6 +578,28 @@ def save_cmpb_user_input(configpath:Union[str,None]=None) -> dict:
             if another_gapfiller:
                 algorithm = click.prompt('Which algorithm do you want to use for gapfilling?', type=click.Choice(['KEGGapFiller','BioCycGapFiller','GeneGapFiller']), show_choices=True)
 
+    # ModelPolisher
+    modelpolisher = click.prompt('Do you want to run ModelPolisher?', type=click.Choice(['y','n']), show_choices=True)
+    match modelpolisher:
+        case 'y':
+            config['modelpolisher'] = True
+            allow_model_to_be_saved_on_server = click.prompt('Do you want to allow the model to be saved on the server?', type=click.Choice(['y','n']), show_choices=True)
+            allow_model_to_be_saved_on_server = True if allow_model_to_be_saved_on_server == 'y' else False
+            config['mp']['allow-model-to-be-saved-on-server'] = allow_model_to_be_saved_on_server
+
+            dont_fix = click.prompt('Do you want to fix the model? Unset default values will be set, if they are mandatory.', type=click.Choice(['y','n']), show_choices=True)
+            dont_fix = False if dont_fix == 'y' else True
+            config['mp']['fixing']['dont-fix'] = dont_fix
+
+            annotate_with_bigg = click.prompt('Do you want to annotate with BiGG?', type=click.Choice(['y','n']), show_choices=True)
+            annotate_with_bigg = True if annotate_with_bigg == 'y' else False
+            config['mp']['annotation']['bigg']['annotate-with-bigg'] = annotate_with_bigg
+            include_any_uri = click.prompt('Do you want to include annotation that are not MIRIAM-compliant?', type=click.Choice(['y','n']), show_choices=True)
+            include_any_uri = True if include_any_uri == 'y' else False
+            config['mp']['annotation']['bigg']['include-any-uri'] = include_any_uri
+        case 'n':
+            config['modelpolisher'] = False
+
     # kegg pathways as groups
     kegg_pw_groups = click.prompt('Do you want to add KEGG pathways as groups to the model?', type=click.Choice(['y','n']), show_choices=True)
     match kegg_pw_groups: