Commit

enforce ft with rel abundances
adamovanja committed May 1, 2024
1 parent 551d020 commit 105c9d9
Showing 2 changed files with 123 additions and 46 deletions.
52 changes: 50 additions & 2 deletions q2_ritme/process_data.py
@@ -1,3 +1,4 @@
import biom
import pandas as pd
import qiime2 as q2
from sklearn.model_selection import GroupShuffleSplit
@@ -6,6 +7,43 @@
from q2_ritme.simulate_data import simulate_data


def get_relative_abundance(
ft: pd.DataFrame, feature_prefix: str = "", no_features: list = []
) -> pd.DataFrame:
"""
Transform a feature table from absolute to relative abundances. Only columns
whose names start with feature_prefix are transformed. If feature_prefix is
not set, all columns except those listed in no_features are transformed.
"""
if feature_prefix != "":
ft_cols = [x for x in ft.columns if x.startswith(feature_prefix)]
elif len(no_features) > 0:
ft_cols = [x for x in ft.columns if x not in no_features]
else:
raise ValueError("Either feature_prefix or no_features must be set")
ft_sel = ft[ft_cols]
# Convert the pandas DataFrame to a biom.Table object
ft_biom = biom.Table(
ft_sel.T.values, observation_ids=ft_sel.columns, sample_ids=ft_sel.index
)

# Normalize the feature table using biom.Table.norm()
ft_rel_biom = ft_biom.norm(axis="sample", inplace=False)

# Convert the normalized biom.Table back to a pandas DataFrame
ft_rel = pd.DataFrame(
ft_rel_biom.matrix_data.toarray().T,
index=ft_rel_biom.ids(axis="sample"),
columns=ft_rel_biom.ids(axis="observation"),
)

print(ft_rel.head())
# rounding needed as some sums deviate from 1.0 by ~2e-16 (floating-point error)
assert ft_rel[ft_cols].sum(axis=1).round(5).eq(1.0).all()

return ft_rel


def load_data(
path2md: str = None, path2ft: str = None, target: str = None
) -> (pd.DataFrame, pd.DataFrame):
@@ -14,7 +52,8 @@ def load_data(
Args:
path2md (str): Path to metadata file. If None, simulated data is used.
path2ft (str): Path to features file. If None, simulated data is used.
path2ft (str): Path to features file - relative abundances expected; absolute
abundances are transformed to relative abundances. If None, simulated data is used.
Returns:
tuple: A tuple containing two pandas DataFrames, first for features,
@@ -27,14 +66,23 @@
ft = pd.read_csv(path2ft, sep="\t", index_col=0)
elif path2ft.endswith(".qza"):
ft = q2.Artifact.load(path2ft).view(pd.DataFrame)
# todo: assert that loaded ft has relative abundances

# flag microbial features with prefix "F"
first_letter = set([i[0] for i in ft.columns.tolist()])
if first_letter != {"F"}:
ft.columns = [f"F{i}" for i in ft.columns.tolist()]
else:
ft, md = simulate_data(1000, target)

# ensure that the loaded ft holds relative abundances
relative_abundances = ft[ft.columns].sum(axis=1).round(5).eq(1.0).all()
if not relative_abundances:
print(
"Feature columns do not sum to 1.0 for all samples - so they are being "
"transformed."
)
ft = get_relative_abundance(ft, "F")

return ft, md


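For reference, a minimal usage sketch of the new helper, using the absolute-abundance fixture values from the updated tests below (sample IDs taken from the fixtures; the inline results are only illustrative):

import pandas as pd

from q2_ritme.process_data import get_relative_abundance

ft_abs = pd.DataFrame(
    {"F0": [100, 10, 3, 5], "F1": [20, 30, 0, 0]},
    index=["SR1", "SR2", "SR3", "SR4"],
)
# columns starting with the prefix "F" are normalised per sample
ft_rel = get_relative_abundance(ft_abs, feature_prefix="F")
# e.g. SR1 -> F0 = 100/120, F1 = 20/120; every row now sums to 1.0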
117 changes: 73 additions & 44 deletions q2_ritme/tests/test_process_data.py
@@ -8,6 +8,7 @@

from q2_ritme.process_data import (
filter_merge_n_sort,
get_relative_abundance,
load_data,
load_n_split_data,
split_data_by_host,
@@ -19,43 +20,84 @@ class TestProcessData(TestPluginBase):

def setUp(self):
super().setUp()
self.data = pd.DataFrame(
self.data_abs = pd.DataFrame(
{
"host_id": ["c", "b", "c", "a"],
"F0": [0.12, 0.23, 0.33, 0.44],
"F1": [0.1, 0.2, 0.3, 0.4],
"F0": [100, 10, 3, 5],
"F1": [20, 30, 0, 0],
"X1": [50, 50, 30, 5],
"X2": [30, 40, 1, 1],
"supertarget": [1, 2, 5, 7],
"covariate": [0, 1, 0, 1],
}
)
self.data.index = ["SR1", "SR2", "SR3", "SR4"]
self.md = self.data[["host_id", "supertarget", "covariate"]]
self.ft = self.data[["F0", "F1"]]
self.data_rel = pd.DataFrame(
{
"host_id": ["c", "b", "c", "a"],
"F0": [0.83333333333333333, 0.25, 1.0, 1.0],
"F1": [0.16666666666666666, 0.75, 0.0, 0.0],
"supertarget": [1, 2, 5, 7],
"covariate": [0, 1, 0, 1],
}
)
self.data_abs.index = ["SR1", "SR2", "SR3", "SR4"]
self.data_rel.index = ["SR1", "SR2", "SR3", "SR4"]
self.md = self.data_rel[["host_id", "supertarget", "covariate"]]
self.ft_abs = self.data_abs[["F0", "F1"]]
self.ft_rel = self.data_rel[["F0", "F1"]]

self.tmpdir = tempfile.TemporaryDirectory()
self.tmp_md_path = os.path.join(self.tmpdir.name, "test_md.tsv")
self.tmp_ft_path = os.path.join(self.tmpdir.name, "test_ft.tsv")
self.md.to_csv(self.tmp_md_path, sep="\t")
self.ft.to_csv(self.tmp_ft_path, sep="\t")

self.tmp_ft_rel_path = os.path.join(self.tmpdir.name, "test_ft_rel.tsv")
self.ft_rel.to_csv(self.tmp_ft_rel_path, sep="\t")
self.tmp_ft_abs_path = os.path.join(self.tmpdir.name, "test_ft_abs.tsv")
self.ft_abs.to_csv(self.tmp_ft_abs_path, sep="\t")

def tearDown(self):
self.tmpdir.cleanup()

def test_load_data_ft_tsv(self):
# Load the data from the temporary files
ft, md = load_data(self.tmp_md_path, self.tmp_ft_path)
def test_get_relative_abundance(self):
ft_rel_obs = get_relative_abundance(self.ft_abs, feature_prefix="F")

pd.testing.assert_frame_equal(ft_rel_obs, self.ft_rel)

def test_get_relative_abundance_no_features(self):
# Call the get_relative_abundance function with no_features
ft_rel_obs = get_relative_abundance(self.ft_abs, no_features=["X1", "X2"])

pd.testing.assert_frame_equal(ft, self.ft)
# Compare the observed and expected DataFrames
pd.testing.assert_frame_equal(ft_rel_obs, self.ft_rel)

def test_get_relative_abundance_no_features_prefix(self):
# Test the case when neither feature_prefix nor no_features is provided
with self.assertRaises(ValueError):
get_relative_abundance(self.ft_abs)

def test_load_data_ft_rel_tsv(self):
# Load the relative feature abundance table
ft_rel, md = load_data(self.tmp_md_path, self.tmp_ft_rel_path)

pd.testing.assert_frame_equal(ft_rel, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_ft_qza(self):
art_ft = q2.Artifact.import_data("FeatureTable[Frequency]", self.ft)
tmp_ft_path_art = self.tmp_ft_path.replace(".tsv", ".qza")
art_ft.save(tmp_ft_path_art)
def test_load_data_ft_abs_tsv(self):
# Load the absolute feature abundance table -> transformed to relative
# abundances
ft, md = load_data(self.tmp_md_path, self.tmp_ft_abs_path)

pd.testing.assert_frame_equal(ft, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_ft_abs_qza(self):
art_ft = q2.Artifact.import_data("FeatureTable[Frequency]", self.ft_abs)
tmp_ft_abs_path_art = self.tmp_ft_abs_path.replace(".tsv", ".qza")
art_ft.save(tmp_ft_abs_path_art)
# Load the data from the temporary files
ft, md = load_data(self.tmp_md_path, tmp_ft_path_art)
ft, md = load_data(self.tmp_md_path, tmp_ft_abs_path_art)

pd.testing.assert_frame_equal(ft, self.ft)
pd.testing.assert_frame_equal(ft, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_simulated(self):
@@ -64,8 +106,8 @@ def test_load_data_simulated(self):
assert md.shape[0] == 1000

def test_load_data_no_feature_prefix(self):
tmp_ft_path_noprefix = self.tmp_ft_path.replace(".tsv", "_noprefix.tsv")
ft_noprefix = self.ft.rename(columns={"F0": "0", "F1": "1"})
tmp_ft_path_noprefix = self.tmp_ft_rel_path.replace(".tsv", "_noprefix.tsv")
ft_noprefix = self.ft_rel.rename(columns={"F0": "0", "F1": "1"})
ft_noprefix.to_csv(tmp_ft_path_noprefix, sep="\t")

ft, _ = load_data(self.tmp_md_path, tmp_ft_path_noprefix)
@@ -74,50 +116,37 @@ def test_load_data_no_feature_prefix(self):
def test_filter_merge_n_sort_w_filter(self):
obs = filter_merge_n_sort(
self.md,
self.ft,
self.ft_rel,
host_id="host_id",
target="supertarget",
filter_md_cols=["host_id", "supertarget"],
)

exp = pd.DataFrame(
{
"host_id": ["a", "b", "c", "c"],
"supertarget": [7, 2, 1, 5],
"F0": [0.44, 0.23, 0.12, 0.33],
"F1": [0.4, 0.2, 0.1, 0.3],
},
index=["SR4", "SR2", "SR1", "SR3"],
exp = self.data_rel.drop("covariate", axis=1).sort_values(
["host_id", "supertarget"]
)
exp = exp[["host_id", "supertarget", "F0", "F1"]]

pd.testing.assert_frame_equal(obs, exp)

def test_filter_merge_n_sort_no_filter(self):
obs = filter_merge_n_sort(
self.md,
self.ft,
self.ft_rel,
host_id="host_id",
target="supertarget",
)

exp = pd.DataFrame(
{
"host_id": ["a", "b", "c", "c"],
"supertarget": [7, 2, 1, 5],
"covariate": [1, 1, 0, 0],
"F0": [0.44, 0.23, 0.12, 0.33],
"F1": [0.4, 0.2, 0.1, 0.3],
},
index=["SR4", "SR2", "SR1", "SR3"],
)
exp = self.data_rel.sort_values(["host_id", "supertarget"])
exp = exp[["host_id", "supertarget", "covariate", "F0", "F1"]]

pd.testing.assert_frame_equal(obs, exp)

def test_split_data_by_host(self):
train_obs, test_obs = split_data_by_host(self.data, "host_id", 0.5, 123)
train_obs, test_obs = split_data_by_host(self.data_rel, "host_id", 0.5, 123)

train_exp = self.data.iloc[[0, 2], :].copy()
test_exp = self.data.iloc[[1, 3], :].copy()
train_exp = self.data_rel.iloc[[0, 2], :].copy()
test_exp = self.data_rel.iloc[[1, 3], :].copy()

assert_frame_equal(train_obs, train_exp)
assert_frame_equal(test_obs, test_exp)
@@ -142,7 +171,7 @@ def test_load_n_split_data(self):
# Call the function with the test paths
train_val, test = load_n_split_data(
self.tmp_md_path,
self.tmp_ft_path,
self.tmp_ft_rel_path,
host_id="host_id",
target="supertarget",
train_size=0.8,
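For context, a minimal sketch of the behaviour these tests pin down, assuming a metadata TSV and an absolute-abundance feature TSV analogous to the temporary fixtures above (the file paths are hypothetical):

from q2_ritme.process_data import load_data

# hypothetical paths mirroring the temporary test files
ft, md = load_data("test_md.tsv", "test_ft_abs.tsv")

# feature columns are prefixed with "F" if needed and, if the table does not
# already hold relative abundances, normalised so every sample sums to ~1.0
assert ft.sum(axis=1).round(5).eq(1.0).all()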
