diff --git a/src/specimen/cmpb/workflow.py b/src/specimen/cmpb/workflow.py index 79875c7..ac376a1 100644 --- a/src/specimen/cmpb/workflow.py +++ b/src/specimen/cmpb/workflow.py @@ -22,10 +22,11 @@ from refinegems.analysis import growth from refinegems.analysis.investigate import plot_rea_sbo_single from refinegems.classes.reports import ModelInfoReport -from refinegems.curation.biomass import test_biomass_presence, check_normalise_biomass +from refinegems.utility.entities import test_biomass_presence +from refinegems.curation.biomass import check_normalise_biomass from refinegems.curation.charges import correct_charges_modelseed from refinegems.curation.curate import resolve_duplicates -from refinegems.curation.gapfill import gapfill_model, gapfill +from refinegems.classes.gapfill import KEGGapFiller, BioCycGapFiller, GeneGapFiller from refinegems.curation.pathways import kegg_pathways, kegg_pathway_analysis from refinegems.curation.polish import polish from refinegems.utility.connections import run_memote, perform_mcc, adjust_BOF, run_SBOannotator @@ -132,6 +133,7 @@ def between_analysis(model: Model, cfg:dict, step:str): Path(dir,"cmpb_out",'logs').mkdir(parents=True, exist_ok=False) # |- logs Path(dir,"cmpb_out",'misc').mkdir(parents=True, exist_ok=False) # |- misc Path(dir,"cmpb_out",'misc', 'memote').mkdir(parents=True, exist_ok=False) # |- memote + Path(dir,"cmpb_out",'misc', 'gapfill').mkdir(parents=True, exist_ok=False) # |- gapfill Path(dir,"cmpb_out",'misc', 'growth').mkdir(parents=True, exist_ok=False) # |- growth Path(dir,"cmpb_out",'misc', 'stats').mkdir(parents=True, exist_ok=False) # |- stats Path(dir,"cmpb_out",'misc', 'kegg_pathway').mkdir(parents=True, exist_ok=False) # |- kegg_pathways @@ -146,9 +148,9 @@ def between_analysis(model: Model, cfg:dict, step:str): ######### if not config['input']['modelpath']: if config['carveme']['gram'] == "grampos" or config['carveme']['gram'] == "gramneg": - subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"]) + subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", '-u', config['carveme']['gram'], "-o", dir+"\cmpb_out\models\Draft.xml"]) else: - subprocess.run(["carve", config['cm-polish']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"]) + subprocess.run(["carve", config['general']['protein_fasta'], "--solver", "scip", "-o", dir+"\cmpb_out\models\Draft.xml"]) config['input']['modelpath'] = dir+'\cmpb_out\models\Draft.xml' current_modelpath = config['input']['modelpath'] @@ -159,10 +161,10 @@ def between_analysis(model: Model, cfg:dict, step:str): if 'CarveMe' in current_libmodel.getAnnotationString(): Path(dir,"cmpb_out",'misc', 'wrong_annotations').mkdir(parents=True, exist_ok=False) current_libmodel = polish(current_libmodel, - email = config['cm-polish']['email'], + email = config['tech-resources']['email'], id_db = config['general']['namespace'], refseq_gff = config['general']['refseq_gff'], - protein_fasta = config['cm-polish']['protein_fasta'], + protein_fasta = config['general']['protein_fasta'], lab_strain = config['cm-polish']['is_lab_strain'], kegg_organism_id = config['general']['kegg_organism_id'], path = Path(dir,'cmpb_out','misc','wrong_annotations')) @@ -188,68 +190,101 @@ def between_analysis(model: Model, cfg:dict, step:str): # gapfilling ############ - # options: automatic/manual extension/manual input - if config['gapfilling']['gap_fill_params']: - - filename = str(Path(dir, 'cmpb_out', 'misc',f'{current_libmodel.getId()}_gap_analysis_{str(today)}')) - gapfilling_perfomed = True - - match config['gapfilling']['gap_fill_params']['db_to_compare']: - - case 'KEGG': - gap_analysis_result, current_libmodel = gapfill( - current_libmodel, 'KEGG', config['general']['refseq_gff'], - config['general']['kegg_organism_id'], None, - filename - ) - case 'BioCyc': - gap_analysis_result, current_libmodel = gapfill( - current_libmodel, 'BioCyc', None, None, config['gapfilling']['gap_fill_params']['biocyc_files'], - filename - ) - case 'KEGG+BioCyc': - gap_analysis_result, current_libmodel = gapfill( - current_libmodel, 'KEGG+BioCyc', config['general']['refseq_gff'], - config['general']['kegg_organism_id'], config['gapfilling']['gap_fill_params']['biocyc_files'], - filename - ) - case 'USER' | None: - logging.info(f'Gapfilling skipped.') - gapfilling_perfomed = False - case _: - logging.warning( - f''' - \'{config["gapfilling"]["gap_fill_params"]["db_to_compare"]}\' is no valid input for the gap analysis. - Please specify for the parameter \'db_to_compare\' one of the following options: - KEGG | BioCyc | KEGG+BioCyc - ''' - ) - - if gapfilling_perfomed: - logging.info(f'Gap analysis for {current_libmodel.getId()} with {config["gapfilling"]["gap_fill_params"]["db_to_compare"]} was performed.') - logging.info(f'Complete Excel table is in file: {filename}.xlsx.') - + run_gapfill = False + # KEGGapFiller + if config['gapfilling']['KEGGapFiller']: + if config['general']['kegg_organism_id']: + run_gapfill = True + # gapfilling with KEGG via KEGG organism ID + kgf = KEGGapFiller(config['general']['kegg_organism_id']) + kgf.find_missing_genes(current_libmodel) + kgf.find_missing_reacs(current_model) + current_libmodel = kgf.fill_model(current_libmodel, + namespace = config['general']['namespace'], + idprefix = config['gapfilling']['idprefix'], + formula_check = config['gapfilling']['formula-check'], + exclude_dna = config['gapfilling']['formula-check'], + exclude_rna = config['gapfilling']['exclude-rna'] + ) # save model if config['general']['save_all_models']: - write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml'))) - current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_autofilled_gaps.xml') + write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml'))) + current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml') else: - write_model_to_file(current_libmodel, str(current_modelpath)) - - logging.info(f'Gaps were filled automatically in {current_libmodel.getId()}.') - - if config['gapfilling']['gap_fill_file']: - current_libmodel = gapfill_model(current_libmodel, config['gapfilling']['gap_fill_file']) - + write_model_to_file(current_libmodel, str(only_modelpath)) + current_modelpath = only_modelpath + current_model = load_model(current_modelpath,'cobra') + else: + mes = f'No KEGG organism ID provided. Gapfilling with KEGG will be skipped.' + raise warnings.warn(mes,UserWarning) + + # BioCycGapFiller + if config['gapfilling']['BioCycGapFiller']: + run_gapfill = True + bcgf = BioCycGapFiller(config['gapfilling']['BioCycGapFiller parameters']['gene-table'], + config['gapfilling']['BioCycGapFiller parameters']['reacs-table'], + config['gapfilling']['BioCycGapFiller parameters']['gff'] + ) + # @TEST if it works + bcgf.find_missing_genes(current_libmodel) + bcgf.find_missing_reacs(current_model) + current_libmodel = kgf.fill_model(current_libmodel, + namespace = config['general']['namespace'], + idprefix = config['gapfilling']['idprefix'], + formula_check = config['gapfilling']['formula-check'], + exclude_dna = config['gapfilling']['formula-check'], + exclude_rna = config['gapfilling']['exclude-rna'] + ) # save model if config['general']['save_all_models']: - write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml'))) - current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_manually_filled_gaps.xml') + write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml'))) + current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml') else: - write_model_to_file(current_libmodel, str(current_modelpath)) - - logging.info(f'Gaps were filled based on a manually curated file in {current_libmodel.getId()}.') + write_model_to_file(current_libmodel, str(only_modelpath)) + current_modelpath = only_modelpath + current_model = load_model(current_modelpath,'cobra') + # GeneGapFiller + if config['gapfilling']['GeneGapFiller']: + run_gapfill = True + ggf = GeneGapFiller() + ggf.find_missing_genes(config['gapfilling']['GeneGapFiller parameters']['gff'], + current_libmodel) + ggf.find_missing_reacs(current_model, + mail = config['tech-resources']['email'], + check_NCBI = config['gapfilling']['GeneGapFiller parameters']['check-NCBI'], + fasta = config['general']['protein_fasta'], + dmnd_db = config['gapfilling']['GeneGapFiller parameters']['swissprot-dmnd'], + swissprot_map = config['gapfilling']['GeneGapFiller parameters']['swissprot-mapping'], + outdir = Path(dir,"cmpb_out",'misc', 'gapfill'), # @TODO or would a subfolder be better? + sens = config['gapfilling']['GeneGapFiller parameters']['sensitivity'], + cov = config['gapfilling']['GeneGapFiller parameters']['coverage'], + t = config['tech-resources']['threads'], + pid = config['gapfilling']['GeneGapFiller parameters']['percentage identity'] + ) + current_libmodel = ggf.fill_model(current_libmodel, + namespace = config['general']['namespace'], + idprefix = config['gapfilling']['idprefix'], + formula_check = config['gapfilling']['formula-check'], + exclude_dna = config['gapfilling']['formula-check'], + exclude_rna = config['gapfilling']['exclude-rna'] + ) + # save model + if config['general']['save_all_models'] and run_gapfill: + write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml'))) + current_modelpath = Path(dir,'cmpb_out','models',f'{current_libmodel.getId()}_after_KEGG_gapfill.xml') + current_model = load_model(current_modelpath,'cobra') + elif run_gapfill: + write_model_to_file(current_libmodel, str(only_modelpath)) + current_modelpath = only_modelpath + current_model = load_model(current_modelpath,'cobra') + else: + pass # no gapfill + + # testing + if run_gapfill: + between_growth_test(current_model,config,step='after_gapfill') + between_analysis(current_model, config, step='after_gapfill') # ModelPolisher ############### @@ -275,11 +310,8 @@ def between_analysis(model: Model, cfg:dict, step:str): # SBOannotator # ------------ - # @TODO - # theoretically: something along the way: - #libsbml_doc = readSBML(current_modelpath) - #libsbml_model = libsbml_doc.getModel() - """ current_libmodel = load_model(str(current_modelpath),'libsbml') + # @TEST , if it works + current_libmodel = load_model(str(current_modelpath),'libsbml') if config['general']['save_all_models']: current_libmodel = run_SBOannotator(current_libmodel) write_model_to_file(current_libmodel, str(Path(dir,'cmpb_out','models', 'SBOannotated.xml'))) @@ -290,7 +322,7 @@ def between_analysis(model: Model, cfg:dict, step:str): current_model = load_model(str(current_modelpath),'cobra') between_analysis(current_model,config,step='after_annotation') - """ + # model cleanup ############### diff --git a/src/specimen/data/config/cmpb_config.yaml b/src/specimen/data/config/cmpb_config.yaml index 6bb5921..8e05aec 100644 --- a/src/specimen/data/config/cmpb_config.yaml +++ b/src/specimen/data/config/cmpb_config.yaml @@ -1,4 +1,4 @@ -# Configuration file for the SPECIMEN CMPB pipeline +# Configuration file for the SPECIMEN CMPB workflow # Meaning of the default parameters: # The value __USER__ indicates parameters required to be specified by the user @@ -30,57 +30,85 @@ general: memote_always_on: False # Run MEMOTE after every step stats_always_on: False # Calculate the model statistics after every step - # Below are options used by multiple steps - refseq_gff: USER # Path to RefSeq GFF file: Required for gap analysis with 'KEGG'. + # Options used by multiple steps of the workflow + refseq_gff: USER # Path to RefSeq GFF file: # Can be optionally provided for cm-polish. kegg_organism_id: USER # KEGG ID of the organism: Required for gap analysis with 'KEGG'. # Can be optionally provided for cm-polish. + protein_fasta: USER # Required, if used for CarveMe or GeneGapFiller. Optional + # for cm-polish except for 'is_lab_strain: True'. + # The path to the protein FASTA used to create the CarveMe model. + # For more information, please refer to the documentation. + +tech-resources: + email: USER # User Mail to use for Entrez (accessing NCBI). + threads: 2 # Number of threads available for tools like DIAMOND. # Part-specific options -# --------------------- +# ===================== # Build a model using CarveMe +# --------------------------- carveme: run_carve: False # if CarveMe should be run, # fill out the params below - protein_fasta: USER # FASTA used for CarveMe gram: USER # grampos or gramneg? # Polish a CarveMe model # Only neccessary, if the model will or has been build with CarveMe # Will only be used, if model is indeed a CarveMe model cm-polish: - email: USER # User Mail to use for Entrez - protein_fasta: USER # Optional, except for 'is_lab_strain: True'. - # The path to the protein FASTA used to create the CarveMe model. - # For more information, please refer to the documentation. is_lab_strain: False # Whether the users strain originates from a lab # Needs to be set to ensure that protein IDs get the 'bqbiol:isHomologTo' qualifier # & to set the locus_tag to the ones obtained by the annotation # (Warning: Might cause issues if annotatione was not performed with NCBI PGAP!) # Filling gaps, optional +# ---------------------- gapfilling: - ### Automatic gap filling ### - # All parameters are required for all 'db_to_compare' choices except 'biocyc_files' which is not required for 'KEGG' - gap_fill_params: - db_to_compare: USER # One of the choices KEGG|BioCyc|KEGG+BioCyc - biocyc_files: - - USER # Path to TXT file containing a SmartTable from BioCyc with the columns 'Accession-2' 'Reaction of gene' - - USER # Path to TXT file containing a SmartTable with all reaction relevant information (*) - - USER # Path to TXT file containing a SmartTable with all metabolite relevant information (+) - - USER # Path to protein FASTA file used as input for CarveMe (Required to get the protein IDs from the locus tags) - # (*) 'Reaction' 'Reactants of reaction' 'Products of reaction' 'EC-Number' 'KEGG Reaction' 'MetaNetX' 'Reaction-Direction' 'Spontaneous?' - # (+) 'Compound' 'Object ID' 'Chemical Formula' 'InChI-Key' 'ChEBI' - gap_fill_file: NULL # Path to Excel file with which gaps in model should be filled - # Either obtained by running gapfilling/Created by hand with the same structure as the result file from gapfilling - # Example Excel file to fill in by hand: refinegems/src/refinegems/example/example_inputs/modelName_gapfill_analysis_date_example.xlsx + ########### general options ########### + # parameters, that apply to all the gap filling algorithmns + idprefix: 'CMPB' # prefix to use for fantasy IDs, if IDs for + # the namespace do not exist. + formula-check: 'existence' # When checking, if a metabolite can be added to the model + # also check the formula. For more information about + # available options, please refer to the docs of + # the function isreaction_comlete(). + exclude-dna: True # Exclude reactions containing 'DNA' in their name + # from being added to the model. + exclude-rna: True # Exclude reactions containing 'RNA' in their name + # from being added to the model. + ########## enable algorithms ########## + # via KEGG ................... + # requires KEGG organism ID to be set + KEGGapFiller: False # activate gap filling via GFF + # via BioCyc ................. + BioCycGapFiller: False # Activate gap filling via BioCyc. + BioCycGapFiller parameters: + gene-table: USER # Path to a gene smart table file from BioCyc. + reacs-table: USER # Path to a reactions smart table from BioCyc. + gff: USER # Path to a GFF file of the genome of the model. + # via GFF .................... + GeneGapFiller: False # Activate gap filling via GFF + GeneGapFiller parameters: + gff: USER # Path to a gff file (does not have to be the RefSeq). + # Needs to be from the same genome the model was build on. + swissprot-dmnd: USER # Path to the SwissProt DIAMOND database file. + swissprot-mapping: USER # Path to the SwissProt mapping file (against EC / BRENDA) + check-NCBI: False # Enable checking NCBI accession numbers for EC numbers - time costly. + sensitivity: 'more-sensitiv' # Sensitivity option for the DIAMOND run. + coverage: 90.0 # Coverage (parameter for DIAMOND). + percentage identity: 90.0 # Percentage identity threshold value for accepting + # matches found by DIAMOND as homologous. + # Add KEGG pathways as groups, optional +# ------------------------------------- kegg_pathway_groups: True # Resolve duplicates +# ------------------ duplicates: # Three possible options for the resolvement of duplicates for the following model entities: # - check: Check for duplicates and simply report them @@ -92,6 +120,7 @@ duplicates: remove_unused_metabs: False # BOFdat / Biomass objective function +# ----------------------------------- BOF: run_bofdat: False # if BOFdat should be run, diff --git a/src/specimen/util/set_up.py b/src/specimen/util/set_up.py index 6d3a684..3b8b6af 100644 --- a/src/specimen/util/set_up.py +++ b/src/specimen/util/set_up.py @@ -60,6 +60,7 @@ # setup data (structure) # ---------------------- # @TEST : deleted BiGG part, since its already covered with refinegems +# @DEPRECATE: MNX now coverend in refinegems - change extension after cleaning gapfill module in refinegems def download_mnx(dir:str='MetaNetX', chunk_size:int=1024): """Download the data needed from the MetaNetX database.