Skip to content

Commit

Permalink
Update CMPB #8
Browse files Browse the repository at this point in the history
  • Loading branch information
cb-Hades committed Aug 19, 2024
1 parent 1dd792f commit b0677ac
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 90 deletions.
166 changes: 99 additions & 67 deletions src/specimen/cmpb/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,11 @@
from refinegems.analysis import growth
from refinegems.analysis.investigate import plot_rea_sbo_single
from refinegems.classes.reports import ModelInfoReport
from refinegems.curation.biomass import test_biomass_presence, check_normalise_biomass
from refinegems.utility.entities import test_biomass_presence
from refinegems.curation.biomass import check_normalise_biomass
from refinegems.curation.charges import correct_charges_modelseed
from refinegems.curation.curate import resolve_duplicates
from refinegems.curation.gapfill import gapfill_model, gapfill
from refinegems.classes.gapfill import KEGGapFiller, BioCycGapFiller, GeneGapFiller
from refinegems.curation.pathways import kegg_pathways, kegg_pathway_analysis
from refinegems.curation.polish import polish
from refinegems.utility.connections import run_memote, perform_mcc, adjust_BOF, run_SBOannotator
Expand Down Expand Up @@ -132,6 +133,7 @@ def between_analysis(model: Model, cfg:dict, step:str):
Path(dir,"cmpb_out",'logs').mkdir(parents=True, exist_ok=False) # |- logs
Path(dir,"cmpb_out",'misc').mkdir(parents=True, exist_ok=False) # |- misc
Path(dir,"cmpb_out",'misc', 'memote').mkdir(parents=True, exist_ok=False) # |- memote
Path(dir,"cmpb_out",'misc', 'gapfill').mkdir(parents=True, exist_ok=False) # |- gapfill
Path(dir,"cmpb_out",'misc', 'growth').mkdir(parents=True, exist_ok=False) # |- growth
Path(dir,"cmpb_out",'misc', 'stats').mkdir(parents=True, exist_ok=False) # |- stats
Path(dir,"cmpb_out",'misc', 'kegg_pathway').mkdir(parents=True, exist_ok=False) # |- kegg_pathways
Expand All @@ -146,9 +148,9 @@ def between_analysis(model: Model, cfg:dict, step:str):
#########
if not config['input']['modelpath']:
if config['carveme']['gram'] == "grampos" or config['carveme']['gram'] == "gramneg":
subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"])
subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"])
else:
subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"])
subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"])
config['input']['modelpath'] = dir+'\cmpb_out\models\Draft.xml'
current_modelpath = config['input']['modelpath']

Expand All @@ -159,10 +161,10 @@ def between_analysis(model: Model, cfg:dict, step:str):
if 'CarveMe' in current_libmodel.getAnnotationString():
Path(dir,"cmpb_out",'misc', 'wrong_annotations').mkdir(parents=True, exist_ok=False)
current_libmodel = polish(current_libmodel,
email = config['cm-polish']['email'],
email = config['tech-resources']['email'],
id_db = config['general']['namespace'],
refseq_gff = config['general']['refseq_gff'],
protein_fasta = config['cm-polish']['protein_fasta'],
protein_fasta = config['general']['protein_fasta'],
lab_strain = config['cm-polish']['is_lab_strain'],
kegg_organism_id = config['general']['kegg_organism_id'],
path = Path(dir,'cmpb_out','misc','wrong_annotations'))
Expand All @@ -188,68 +190,101 @@ def between_analysis(model: Model, cfg:dict, step:str):

# gapfilling
############
# options: automatic/manual extension/manual input
if config['gapfilling']['gap_fill_params']:

filename = str(Path(dir, 'cmpb_out', 'misc',f'{current_libmodel.getId()}_gap_analysis_{str(today)}'))
gapfilling_perfomed = True

match config['gapfilling']['gap_fill_params']['db_to_compare']:

case 'KEGG':
gap_analysis_result, current_libmodel = gapfill(
current_libmodel, 'KEGG', config['general']['refseq_gff'],
config['general']['kegg_organism_id'], None,
filename
)
case 'BioCyc':
gap_analysis_result, current_libmodel = gapfill(
current_libmodel, 'BioCyc', None, None, config['gapfilling']['gap_fill_params']['biocyc_files'],
filename
)
case 'KEGG+BioCyc':
gap_analysis_result, current_libmodel = gapfill(
current_libmodel, 'KEGG+BioCyc', config['general']['refseq_gff'],
config['general']['kegg_organism_id'], config['gapfilling']['gap_fill_params']['biocyc_files'],
filename
)
case 'USER' | None:
logging.info(f'Gapfilling skipped.')
gapfilling_perfomed = False
case _:
logging.warning(
f'''
\'{config["gapfilling"]["gap_fill_params"]["db_to_compare"]}\' is no valid input for the gap analysis.
Please specify for the parameter \'db_to_compare\' one of the following options:
KEGG | BioCyc | KEGG+BioCyc
'''
)

if gapfilling_perfomed:
logging.info(f'Gap analysis for {current_libmodel.getId()} with {config["gapfilling"]["gap_fill_params"]["db_to_compare"]} was performed.')
logging.info(f'Complete Excel table is in file: {filename}.xlsx.')

run_gapfill = False
# KEGGapFiller
if config['gapfilling']['KEGGapFiller']:
if config['general']['kegg_organism_id']:
run_gapfill = True
# gapfilling with KEGG via KEGG organism ID
kgf = KEGGapFiller(config['general']['kegg_organism_id'])
kgf.find_missing_genes(current_libmodel)
kgf.find_missing_reacs(current_model)
current_libmodel = kgf.fill_model(current_libmodel,
namespace = config['general']['namespace'],
idprefix = config['gapfilling']['idprefix'],
formula_check = config['gapfilling']['formula-check'],
exclude_dna = config['gapfilling']['formula-check'],
exclude_rna = config['gapfilling']['exclude-rna']
)
# save model
if config['general']['save_all_models']:
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml')))
current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml')
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))
current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
else:
write_model_to_file(current_libmodel, str(current_modelpath))

logging.info(f'Gaps were filled automatically in {current_libmodel.getId()}.')

if config['gapfilling']['gap_fill_file']:
current_libmodel = gapfill_model(current_libmodel, config['gapfilling']['gap_fill_file'])

write_model_to_file(current_libmodel, str(only_modelpath))
current_modelpath = only_modelpath
current_model = load_model(current_modelpath,'cobra')
else:
mes = f'No KEGG organism ID provided. Gapfilling with KEGG will be skipped.'
raise warnings.warn(mes,UserWarning)

# BioCycGapFiller
if config['gapfilling']['BioCycGapFiller']:
run_gapfill = True
bcgf = BioCycGapFiller(config['gapfilling']['BioCycGapFiller parameters']['gene-table'],
config['gapfilling']['BioCycGapFiller parameters']['reacs-table'],
config['gapfilling']['BioCycGapFiller parameters']['gff']
)
# @TEST if it works
bcgf.find_missing_genes(current_libmodel)
bcgf.find_missing_reacs(current_model)
current_libmodel = kgf.fill_model(current_libmodel,
namespace = config['general']['namespace'],
idprefix = config['gapfilling']['idprefix'],
formula_check = config['gapfilling']['formula-check'],
exclude_dna = config['gapfilling']['formula-check'],
exclude_rna = config['gapfilling']['exclude-rna']
)
# save model
if config['general']['save_all_models']:
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml')))
current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml')
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))
current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
else:
write_model_to_file(current_libmodel, str(current_modelpath))

logging.info(f'Gaps were filled based on a manually curated file in {current_libmodel.getId()}.')
write_model_to_file(current_libmodel, str(only_modelpath))
current_modelpath = only_modelpath
current_model = load_model(current_modelpath,'cobra')

# GeneGapFiller
if config['gapfilling']['GeneGapFiller']:
run_gapfill = True
ggf = GeneGapFiller()
ggf.find_missing_genes(config['gapfilling']['GeneGapFiller parameters']['gff'],
current_libmodel)
ggf.find_missing_reacs(current_model,
mail = config['tech-resources']['email'],
check_NCBI = config['gapfilling']['GeneGapFiller parameters']['check-NCBI'],
fasta = config['general']['protein_fasta'],
dmnd_db = config['gapfilling']['GeneGapFiller parameters']['swissprot-dmnd'],
swissprot_map = config['gapfilling']['GeneGapFiller parameters']['swissprot-mapping'],
outdir = Path(dir,"cmpb_out",'misc', 'gapfill'), # @TODO or would a subfolder be better?
sens = config['gapfilling']['GeneGapFiller parameters']['sensitivity'],
cov = config['gapfilling']['GeneGapFiller parameters']['coverage'],
t = config['tech-resources']['threads'],
pid = config['gapfilling']['GeneGapFiller parameters']['percentage identity']
)
current_libmodel = ggf.fill_model(current_libmodel,
namespace = config['general']['namespace'],
idprefix = config['gapfilling']['idprefix'],
formula_check = config['gapfilling']['formula-check'],
exclude_dna = config['gapfilling']['formula-check'],
exclude_rna = config['gapfilling']['exclude-rna']
)
# save model
if config['general']['save_all_models'] and run_gapfill:
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')))
current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml')
current_model = load_model(current_modelpath,'cobra')
elif run_gapfill:
write_model_to_file(current_libmodel, str(only_modelpath))
current_modelpath = only_modelpath
current_model = load_model(current_modelpath,'cobra')
else:
pass # no gapfill

# testing
if run_gapfill:
between_growth_test(current_model,config,step='after_gapfill')
between_analysis(current_model, config, step='after_gapfill')

# ModelPolisher
###############
Expand All @@ -275,11 +310,8 @@ def between_analysis(model: Model, cfg:dict, step:str):

# SBOannotator
# ------------
# @TODO
# theoretically: something along the way:
#libsbml_doc = readSBML(current_modelpath)
#libsbml_model = libsbml_doc.getModel()
""" current_libmodel = load_model(str(current_modelpath),'libsbml')
# @TEST , if it works
current_libmodel = load_model(str(current_modelpath),'libsbml')
if config['general']['save_all_models']:
current_libmodel = run_SBOannotator(current_libmodel)
write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models', 'SBOannotated.xml')))
Expand All @@ -290,7 +322,7 @@ def between_analysis(model: Model, cfg:dict, step:str):

current_model = load_model(str(current_modelpath),'cobra')
between_analysis(current_model,config,step='after_annotation')
"""


# model cleanup
###############
Expand Down
Loading

0 comments on commit b0677ac

Please sign in to comment.