Skip to content

Commit

Permalink
Update #99
Browse files Browse the repository at this point in the history
  • Loading branch information
cb-Hades committed Feb 19, 2024
1 parent d5edc2e commit 4b18364
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 15 deletions.
15 changes: 3 additions & 12 deletions src/refinegems/classes/medium.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,20 +549,20 @@ def export_to_cobra(self, namespace:Literal['Name', 'BiGG']='BiGG', default_flux
# functions for loading from DB
############################################################################

def load_substance_table_from_db(mediumname: str, database:str, type='standard') -> pd.DataFrame:
def load_substance_table_from_db(mediumname: str, database:str,
type:Literal['testing','standard']='standard') -> pd.DataFrame:
"""Load a substance table from a database.
Currently available types:
- 'testing': for debugging
- 'documentation': for downloading a table for the docs
- 'standard': The standard format containing all information in long format.
Note: 'documentation' currently object to change
Args:
name (str): The name (or identifier) of the medium.
database (str): Path to the database.
type (str, optional): How to load the table. Defaults to 'standard'.
type (Literal['testing','standard'], optional): How to load the table. Defaults to 'standard'.
Raises:
ValueError: Unknown type for loading the substance table.
Expand All @@ -585,15 +585,6 @@ def load_substance_table_from_db(mediumname: str, database:str, type='standard')
substance_table = result.fetchall()
substance_table = pd.DataFrame(substance_table, columns=['id','name','formula','flux'])

# create table for documentation
case 'documentation':
result = cursor.execute("""SELECT substance.name, substance.formula, medium2substance.flux , medium2substance.source, substance2db.db_id, substance2db.db_type
FROM medium, medium2substance, substance, substance2db
WHERE medium.name = ? AND medium.id = medium2substance.medium_id AND medium2substance.substance_id = substance.id AND substance2db.substance_id = substance.id
""", (mediumname,))
substance_table = result.fetchall()
substance_table = pd.DataFrame(substance_table, columns=['name','formula','flux','source','db_id','db_type'])

# create table with all information, standard for generating the Medium table
case 'standard':
result = cursor.execute("""SELECT substance.name, substance.formula, medium2substance.flux , medium2substance.source, substance2db.db_id, substance2db.db_type
Expand Down
122 changes: 119 additions & 3 deletions src/refinegems/curation/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,14 @@
The newer version of CarveMe leads to some irritations in the model, these scripts enable for example the addition of BiGG Ids to the annotations as well as a correct formatting of the annotations.
"""

__author__ = "Famke Baeuerle and Gwendolyn O. Döbel and Carolin Brune"

################################################################################
# requirements
################################################################################

import re, logging
import cobra
from bioregistry import is_valid_identifier, manager, normalize_parsed_curie, get_identifiers_org_iri
import pandas as pd
from bioservices.kegg import KEGG
Expand All @@ -21,12 +28,20 @@
from colorama import Fore, Style
from datetime import date

__author__ = "Famke Baeuerle and Gwendolyn O. Döbel"

################################################################################
# variables
################################################################################

#------------------------------------------------ Constant variables --------------------------------------------------#
BIOCYC_TIER1_DATABASES_PREFIXES = ['META', 'ECO', 'ECOLI', 'HUMAN']

################################################################################
# functions
################################################################################

# ------------------
# Polish - Main part
# ------------------

#----------- Functions to add URIs from the entity IDs to the annotation field for metabolites & reactions ------------#
def add_metab(entity_list: list[Species], id_db: str):
Expand Down Expand Up @@ -1250,4 +1265,105 @@ def polish(model: libModel, email: str, id_db: str, refseq_gff: str,
print('Changing all qualifiers to be MIRIAM compliant:')
change_all_qualifiers(model, lab_strain)

return model
return model



# ----------------------
# Directionality Control
# ----------------------

# @TODO
# @NOTE add a control to check if chaning the reaction direction leads to EGC or so
# or is this unnessesary here
def check_direction(model:cobra.Model,data:pd.DataFrame|str) -> cobra.Model:
"""Check the direction of reactions by searching for matching MetaCyc,
KEGG and MetaNetX IDs as well as EC number in a downloaded BioCyc (MetaCyc)
database table or dataFrame (need to contain at least the following columns:
Reactions (MetaCyc ID),EC-Number,KEGG reaction,METANETX,Reaction-Direction.
Args:
model (cobra.Model): The model loaded with COBRApy.
data (pd.DataFrame | str): Either a pandas DataFrame or a path to a CSV file
containing the BioCyc smart table.
Raises:
TypeError: Unknown data type for parameter data
Returns:
cobra.Model: The edited model.
"""

match data:
# aleady a DataFrame
case pd.DataFrame():
pass
case str():
# load from a table
data = pd.read_csv(data, sep='\t')
# rewrite the columns into a better comparable/searchable format
data['KEGG reaction'] = data['KEGG reaction'].str.extract('.*>(R\d*)<.*')
data['METANETX'] = data['METANETX'].str.extract('.*>(MNXR\d*)<.*')
data['EC-Number'] = data['EC-Number'].str.extract('EC-(.*)')
case _:
mes = f'Unknown data type for parameter data: {type(data)}'
raise TypeError(mes)

# check direction
# --------------------
for r in model.reactions:

direction = None
# easy case: metacyc is already (corretly) annotated
if 'metacyc.reaction' in r.annotation and len(data[data['Reactions'] == r.annotation['metacyc.reaction']]) != 0:
direction = data[data['Reactions'] == r.annotation['metacyc.reaction']]['Reaction-Direction'].iloc[0]
r.notes['BioCyc direction check'] = F'found {direction}'
# complicated case: no metacyc annotation
else:
annotations = []

# collect matches
if 'kegg.reaction' in r.annotation and r.annotation['kegg.reaction'] in data['KEGG reaction'].tolist():
annotations.append(data[data['KEGG reaction'] == r.annotation['kegg.reaction']]['Reactions'].tolist())
if 'metanetx.reaction' in r.annotation and r.annotation['metanetx.reaction'] in data['METANETX'].tolist():
annotations.append(data[data['METANETX'] == r.annotation['metanetx.reaction']]['Reactions'].tolist())
if 'ec-code' in r.annotation and r.annotation['ec-code'] in data['EC-Number'].tolist():
annotations.append(data[data['EC-Number'] == r.annotation['ec-code']]['Reactions'].tolist())

# check results
# no matches
if len(annotations) == 0:
r.notes['BioCyc direction check'] = 'not found'

# matches found
else:
# built intersection
intersec = set(annotations[0]).intersection(*annotations)
# case 1: exactly one match remains
if len(intersec) == 1:
entry = intersec.pop()
direction = data[data['Reactions'] == entry]['Reaction-Direction'].iloc[0]
r.annotation['metacyc.reaction'] = entry
r.notes['BioCyc direction check'] = F'found {direction}'

# case 2: multiple matches found -> inconclusive
else:
r.notes['BioCyc direction check'] = F'found, but inconclusive'

# update direction if possible and needed
if not pd.isnull(direction):
if 'REVERSIBLE' in direction:
# set reaction as reversible by setting default values for upper and lower bounds
r.lower_bound = -1000.
elif 'RIGHT-TO-LEFT' in direction:
# invert the default values for the boundaries
r.lower_bound = -1000.
r.upper_bound = 0.
else:
# left to right case is the standart for adding reactions
# = nothing left to do
continue

return model


0 comments on commit 4b18364

Please sign in to comment.