Commit

enforce ft with rel abundances
adamovanja committed May 1, 2024
1 parent 551d020 commit 105c9d9
Showing 2 changed files with 123 additions and 46 deletions.
52 changes: 50 additions & 2 deletions q2_ritme/process_data.py
@@ -1,3 +1,4 @@
import biom
import pandas as pd
import qiime2 as q2
from sklearn.model_selection import GroupShuffleSplit
@@ -6,6 +7,43 @@
from q2_ritme.simulate_data import simulate_data


def get_relative_abundance(
ft: pd.DataFrame, feature_prefix: str = "", no_features: list = []
) -> pd.DataFrame:
"""
Transform a feature table from absolute to relative abundances. Only columns
whose names start with feature_prefix are transformed. If feature_prefix is
not set, all columns except those listed in no_features are transformed.
"""
if feature_prefix != "":
ft_cols = [x for x in ft.columns if x.startswith(feature_prefix)]
elif len(no_features) > 0:
ft_cols = [x for x in ft.columns if x not in no_features]
else:
raise ValueError("Either feature_prefix or no_features must be set")
ft_sel = ft[ft_cols]
# Convert the pandas DataFrame to a biom.Table object
ft_biom = biom.Table(
ft_sel.T.values, observation_ids=ft_sel.columns, sample_ids=ft_sel.index
)

# Normalize the feature table using biom.Table.norm()
ft_rel_biom = ft_biom.norm(axis="sample", inplace=False)

# Convert the normalized biom.Table back to a pandas DataFrame
ft_rel = pd.DataFrame(
ft_rel_biom.matrix_data.toarray().T,
index=ft_rel_biom.ids(axis="sample"),
columns=ft_rel_biom.ids(axis="observation"),
)

print(ft_rel.head())
# rounding needed as some sums deviate from 1.0 by ~2e-16 (floating-point error)
assert ft_rel[ft_cols].sum(axis=1).round(5).eq(1.0).all()

return ft_rel


def load_data(
path2md: str = None, path2ft: str = None, target: str = None
) -> (pd.DataFrame, pd.DataFrame):
@@ -14,7 +52,8 @@ def load_data(
Args:
path2md (str): Path to metadata file. If None, simulated data is used.
path2ft (str): Path to features file. If None, simulated data is used.
path2ft (str): Path to features file - relative abundances expected; absolute
abundances are transformed to relative abundances. If None, simulated data is used.
Returns:
tuple: A tuple containing two pandas DataFrames, first for features,
@@ -27,14 +66,23 @@
ft = pd.read_csv(path2ft, sep="\t", index_col=0)
elif path2ft.endswith(".qza"):
ft = q2.Artifact.load(path2ft).view(pd.DataFrame)
# todo: assert that loaded ft has relative abundances

# flag microbial features with prefix "F"
first_letter = set([i[0] for i in ft.columns.tolist()])
if first_letter != {"F"}:
ft.columns = [f"F{i}" for i in ft.columns.tolist()]
else:
ft, md = simulate_data(1000, target)

# ensure that the loaded ft holds relative abundances
relative_abundances = ft[ft.columns].sum(axis=1).round(5).eq(1.0).all()
if not relative_abundances:
print(
"Feature columns do not sum to 1.0 for all samples - so they are being "
"transformed."
)
ft = get_relative_abundance(ft, "F")

return ft, md


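For reference, a minimal usage sketch of the new helper, using the absolute-abundance fixture values from the updated tests below (sample IDs taken from the fixtures; the inline results are only illustrative):

import pandas as pd

from q2_ritme.process_data import get_relative_abundance

ft_abs = pd.DataFrame(
    {"F0": [100, 10, 3, 5], "F1": [20, 30, 0, 0]},
    index=["SR1", "SR2", "SR3", "SR4"],
)
# columns starting with the prefix "F" are normalised per sample
ft_rel = get_relative_abundance(ft_abs, feature_prefix="F")
# e.g. SR1 -> F0 = 100/120, F1 = 20/120; every row now sums to 1.0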
117 changes: 73 additions & 44 deletions q2_ritme/tests/test_process_data.py
@@ -8,6 +8,7 @@

from q2_ritme.process_data import (
filter_merge_n_sort,
get_relative_abundance,
load_data,
load_n_split_data,
split_data_by_host,
@@ -19,43 +20,84 @@ class TestProcessData(TestPluginBase):

def setUp(self):
super().setUp()
self.data = pd.DataFrame(
self.data_abs = pd.DataFrame(
{
"host_id": ["c", "b", "c", "a"],
"F0": [0.12, 0.23, 0.33, 0.44],
"F1": [0.1, 0.2, 0.3, 0.4],
"F0": [100, 10, 3, 5],
"F1": [20, 30, 0, 0],
"X1": [50, 50, 30, 5],
"X2": [30, 40, 1, 1],
"supertarget": [1, 2, 5, 7],
"covariate": [0, 1, 0, 1],
}
)
self.data.index = ["SR1", "SR2", "SR3", "SR4"]
self.md = self.data[["host_id", "supertarget", "covariate"]]
self.ft = self.data[["F0", "F1"]]
self.data_rel = pd.DataFrame(
{
"host_id": ["c", "b", "c", "a"],
"F0": [0.83333333333333333, 0.25, 1.0, 1.0],
"F1": [0.16666666666666666, 0.75, 0.0, 0.0],
"supertarget": [1, 2, 5, 7],
"covariate": [0, 1, 0, 1],
}
)
self.data_abs.index = ["SR1", "SR2", "SR3", "SR4"]
self.data_rel.index = ["SR1", "SR2", "SR3", "SR4"]
self.md = self.data_rel[["host_id", "supertarget", "covariate"]]
self.ft_abs = self.data_abs[["F0", "F1"]]
self.ft_rel = self.data_rel[["F0", "F1"]]

self.tmpdir = tempfile.TemporaryDirectory()
self.tmp_md_path = os.path.join(self.tmpdir.name, "test_md.tsv")
self.tmp_ft_path = os.path.join(self.tmpdir.name, "test_ft.tsv")
self.md.to_csv(self.tmp_md_path, sep="\t")
self.ft.to_csv(self.tmp_ft_path, sep="\t")

self.tmp_ft_rel_path = os.path.join(self.tmpdir.name, "test_ft_rel.tsv")
self.ft_rel.to_csv(self.tmp_ft_rel_path, sep="\t")
self.tmp_ft_abs_path = os.path.join(self.tmpdir.name, "test_ft_abs.tsv")
self.ft_abs.to_csv(self.tmp_ft_abs_path, sep="\t")

def tearDown(self):
self.tmpdir.cleanup()

def test_load_data_ft_tsv(self):
# Load the data from the temporary files
ft, md = load_data(self.tmp_md_path, self.tmp_ft_path)
def test_get_relative_abundance(self):
ft_rel_obs = get_relative_abundance(self.ft_abs, feature_prefix="F")

pd.testing.assert_frame_equal(ft_rel_obs, self.ft_rel)

def test_get_relative_abundance_no_features(self):
# Call the get_relative_abundance function with no_features
ft_rel_obs = get_relative_abundance(self.ft_abs, no_features=["X1", "X2"])

pd.testing.assert_frame_equal(ft, self.ft)
# Compare the observed and expected DataFrames
pd.testing.assert_frame_equal(ft_rel_obs, self.ft_rel)

def test_get_relative_abundance_no_features_prefix(self):
# Test the case when neither feature_prefix nor no_features is provided
with self.assertRaises(ValueError):
get_relative_abundance(self.ft_abs)

def test_load_data_ft_rel_tsv(self):
# Load the relative feature abundance table
ft_rel, md = load_data(self.tmp_md_path, self.tmp_ft_rel_path)

pd.testing.assert_frame_equal(ft_rel, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_ft_qza(self):
art_ft = q2.Artifact.import_data("FeatureTable[Frequency]", self.ft)
tmp_ft_path_art = self.tmp_ft_path.replace(".tsv", ".qza")
art_ft.save(tmp_ft_path_art)
def test_load_data_ft_abs_tsv(self):
# Load the absolute feature abundance table -> transformed to relative
# abundances
ft, md = load_data(self.tmp_md_path, self.tmp_ft_abs_path)

pd.testing.assert_frame_equal(ft, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_ft_abs_qza(self):
art_ft = q2.Artifact.import_data("FeatureTable[Frequency]", self.ft_abs)
tmp_ft_abs_path_art = self.tmp_ft_abs_path.replace(".tsv", ".qza")
art_ft.save(tmp_ft_abs_path_art)
# Load the data from the temporary files
ft, md = load_data(self.tmp_md_path, tmp_ft_path_art)
ft, md = load_data(self.tmp_md_path, tmp_ft_abs_path_art)

pd.testing.assert_frame_equal(ft, self.ft)
pd.testing.assert_frame_equal(ft, self.ft_rel)
pd.testing.assert_frame_equal(md, self.md)

def test_load_data_simulated(self):
@@ -64,8 +106,8 @@ def test_load_data_simulated(self):
assert md.shape[0] == 1000

def test_load_data_no_feature_prefix(self):
tmp_ft_path_noprefix = self.tmp_ft_path.replace(".tsv", "_noprefix.tsv")
ft_noprefix = self.ft.rename(columns={"F0": "0", "F1": "1"})
tmp_ft_path_noprefix = self.tmp_ft_rel_path.replace(".tsv", "_noprefix.tsv")
ft_noprefix = self.ft_rel.rename(columns={"F0": "0", "F1": "1"})
ft_noprefix.to_csv(tmp_ft_path_noprefix, sep="\t")

ft, _ = load_data(self.tmp_md_path, tmp_ft_path_noprefix)
@@ -74,50 +116,37 @@ def test_load_data_no_feature_prefix(self):
def test_filter_merge_n_sort_w_filter(self):
obs = filter_merge_n_sort(
self.md,
self.ft,
self.ft_rel,
host_id="host_id",
target="supertarget",
filter_md_cols=["host_id", "supertarget"],
)

exp = pd.DataFrame(
{
"host_id": ["a", "b", "c", "c"],
"supertarget": [7, 2, 1, 5],
"F0": [0.44, 0.23, 0.12, 0.33],
"F1": [0.4, 0.2, 0.1, 0.3],
},
index=["SR4", "SR2", "SR1", "SR3"],
exp = self.data_rel.drop("covariate", axis=1).sort_values(
["host_id", "supertarget"]
)
exp = exp[["host_id", "supertarget", "F0", "F1"]]

pd.testing.assert_frame_equal(obs, exp)

def test_filter_merge_n_sort_no_filter(self):
obs = filter_merge_n_sort(
self.md,
self.ft,
self.ft_rel,
host_id="host_id",
target="supertarget",
)

exp = pd.DataFrame(
{
"host_id": ["a", "b", "c", "c"],
"supertarget": [7, 2, 1, 5],
"covariate": [1, 1, 0, 0],
"F0": [0.44, 0.23, 0.12, 0.33],
"F1": [0.4, 0.2, 0.1, 0.3],
},
index=["SR4", "SR2", "SR1", "SR3"],
)
exp = self.data_rel.sort_values(["host_id", "supertarget"])
exp = exp[["host_id", "supertarget", "covariate", "F0", "F1"]]

pd.testing.assert_frame_equal(obs, exp)

def test_split_data_by_host(self):
train_obs, test_obs = split_data_by_host(self.data, "host_id", 0.5, 123)
train_obs, test_obs = split_data_by_host(self.data_rel, "host_id", 0.5, 123)

train_exp = self.data.iloc[[0, 2], :].copy()
test_exp = self.data.iloc[[1, 3], :].copy()
train_exp = self.data_rel.iloc[[0, 2], :].copy()
test_exp = self.data_rel.iloc[[1, 3], :].copy()

assert_frame_equal(train_obs, train_exp)
assert_frame_equal(test_obs, test_exp)
@@ -142,7 +171,7 @@ def test_load_n_split_data(self):
# Call the function with the test paths
train_val, test = load_n_split_data(
self.tmp_md_path,
self.tmp_ft_path,
self.tmp_ft_rel_path,
host_id="host_id",
target="supertarget",
train_size=0.8,
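For context, a minimal sketch of the behaviour these tests pin down, assuming a metadata TSV and an absolute-abundance feature TSV analogous to the temporary fixtures above (the file paths are hypothetical):

from q2_ritme.process_data import load_data

# hypothetical paths mirroring the temporary test files
ft, md = load_data("test_md.tsv", "test_ft_abs.tsv")

# feature columns are prefixed with "F" if needed and, if the table does not
# already hold relative abundances, normalised so every sample sums to ~1.0
assert ft.sum(axis=1).round(5).eq(1.0).all()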
