Script to concatenate TSVs

broadinstitute · Oct 23, 2024 · 24fd2d0 · 24fd2d0
1 parent 8c6071b
commit 24fd2d0
Show file tree

Hide file tree

Showing 5 changed files with 333 additions and 0 deletions.
diff --git a/src/python/dropseq_aggregation/pyproject.toml b/src/python/dropseq_aggregation/pyproject.toml
@@ -25,3 +25,4 @@ Issues = "https://github.com/broadinstitute/Drop-seq/issues"
 
 [project.scripts]
 join_and_filter_tsv = "dropseq_aggregation.join_and_filter_tsv:main"
+cat_tsvs = "dropseq_aggregation.cat_tsvs:main"
diff --git a/src/python/dropseq_aggregation/src/dropseq_aggregation/cat_tsvs.py b/src/python/dropseq_aggregation/src/dropseq_aggregation/cat_tsvs.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# MIT License
+# 
+# Copyright 2024 Broad Institute
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Read one or more tab-separated files with header, and write a single tab-separated file with rows concatenated from the
+input files.  The output columns will be the union of all the input columns, with empty values for input files that
+lack a column.  """
+
+import argparse
+import sys
+import pandas as pd
+
+from . import logger, add_log_argument
+
+def main(args=None):
+    """Command-line entry point: parse the arguments and hand them to run().
+
+    :param args: argument strings to parse; None lets argparse read sys.argv.
+    :return: whatever run() returns, intended for use as the process exit status.
+    """
+    index_col_help = ("Column to use as index.  May be specified more than once.  "
+                      "If indices are not unique, an error will be raised.")
+    output_help = "Output file.  Default: stdout."
+    input_help = "Input tab-separated files.  May be specified more than once."
+
+    arg_parser = argparse.ArgumentParser(description=__doc__)
+    add_log_argument(arg_parser)
+    arg_parser.add_argument(
+        "--index-col", "-i",
+        default=None,
+        action="append",
+        help=index_col_help,
+    )
+    arg_parser.add_argument(
+        "--output", "-o",
+        default=sys.stdout,
+        type=argparse.FileType("w"),
+        help=output_help,
+    )
+    arg_parser.add_argument(
+        "input",
+        nargs="+",
+        type=argparse.FileType("r"),
+        help=input_help,
+    )
+    return run(arg_parser.parse_args(args))
+
+def run(options):
+    """Concatenate the input TSVs and write the combined table to the output.
+
+    :param options: namespace-like object with attributes ``input`` (open, readable text
+        files), ``output`` (open, writable text file) and ``index_col`` (list of key column
+        names, or None/empty to skip the uniqueness check).
+    :return: 0 on success, 1 if the key columns are not unique across the concatenated rows.
+    """
+    try:
+        dfs = [pd.read_csv(f, sep="\t") for f in options.input]
+    finally:
+        # Close with an explicit loop: map() is lazy in Python 3, so a bare
+        # map(lambda f: f.close(), ...) whose result is discarded never calls close().
+        for f in options.input:
+            f.close()
+
+    try:
+        df = pd.concat(dfs)
+        if options.index_col:
+            # keep=False marks every row that shares a key, including the first occurrence
+            duplicate_keys = df[df.duplicated(subset=options.index_col, keep=False)]
+
+            if not duplicate_keys.empty:
+                logger.error(f"Duplicate keys found: {duplicate_keys[options.index_col]}")
+                return 1
+
+        df.to_csv(options.output, sep="\t", index=False)
+        return 0
+    finally:
+        # Release the output on every exit path, but leave the process-wide stdout open
+        # so callers that invoke run() programmatically can keep writing to it.
+        if options.output is sys.stdout:
+            options.output.flush()
+        else:
+            options.output.close()
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/python/dropseq_aggregation/tests/test_cat_tsvs.py b/src/python/dropseq_aggregation/tests/test_cat_tsvs.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# MIT License
+# 
+# Copyright 2024 Broad Institute
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+import os
+import shutil
+import tempfile
+import unittest
+import pandas as pd
+import dropseq_aggregation.cat_tsvs
+
+# Lightweight stand-in for the options object passed to cat_tsvs.run().
+# namedtuple applies defaults to the rightmost fields, so "input" and "index_col"
+# default to None while "output" must always be supplied.
+OptionsTuple = collections.namedtuple("OptionsTuple", ["output", "input", "index_col"],
+                                      defaults=[None, None])
+
+class TestCatTsvs(unittest.TestCase):
+    """Tests for dropseq_aggregation.cat_tsvs.run(), driven with hand-built option tuples."""
+
+    def setUp(self):
+        # NOTE: relative path, so the tests only find their data when run from a
+        # working directory where this path resolves.
+        self.testDataDir = "../../../testdata/python/dropseq_aggregation/cat_tsvs"
+        self.tmpDir = tempfile.mkdtemp(".tmp", "cat_tsvs.")
+        self.outputFile = os.path.join(self.tmpDir, "output.tsv")
+        # Every handle opened through _open() is tracked here so tearDown can close it.
+        self.openFiles = []
+        self.options = OptionsTuple(self._open(self.outputFile, "w"))
+        self.inputs = [os.path.join(self.testDataDir, f"rxn{i+1}.joined_filtered_cell_metadata.tsv") for i in range(2)]
+        self.index_cols = ["PREFIX", "CELL_BARCODE"]
+
+    def tearDown(self):
+        # Close handles before deleting the directory; the tests do not rely on run()
+        # having closed what it was given (closing an already-closed file is a no-op).
+        for f in self.openFiles:
+            f.close()
+        shutil.rmtree(self.tmpDir)
+
+    def _open(self, path, mode="r"):
+        """Open path and register the handle for closing in tearDown."""
+        f = open(path, mode)
+        self.openFiles.append(f)
+        return f
+
+    def _run_and_check(self, inputs):
+        """Run cat_tsvs on inputs; check the row count and that columns are the union of the inputs'."""
+        options = self.options._replace(index_col=self.index_cols, input=[self._open(f) for f in inputs])
+        self.assertEqual(dropseq_aggregation.cat_tsvs.run(options), 0)
+        outDf = pd.read_csv(self.outputFile, sep="\t")
+        inDfs = [pd.read_csv(f, sep="\t") for f in inputs]
+        self.assertEqual(len(outDf), sum(len(df) for df in inDfs))
+        self.assertEqual(set(outDf.columns), set.union(*[set(df.columns) for df in inDfs]))
+
+    def test_basic(self):
+        self._run_and_check(self.inputs)
+
+    def test_duplicate_keys(self):
+        # Feeding the same file twice guarantees every key appears more than once.
+        options = self.options._replace(index_col=self.index_cols,
+                                        input=[self._open(self.inputs[0]), self._open(self.inputs[0])])
+        self.assertNotEqual(dropseq_aggregation.cat_tsvs.run(options), 0)
+
+    def test_fewer_columns(self):
+        # Drop one column from the first input so the two inputs have different column sets.
+        dfToClip = pd.read_csv(self.inputs[0], sep="\t", index_col=self.index_cols)
+        dfToClip = dfToClip.drop(columns="doublet")
+        clippedFile = os.path.join(self.tmpDir, "clipped.tsv")
+        dfToClip.to_csv(clippedFile, sep="\t")
+        self._run_and_check([clippedFile, self.inputs[1]])
Original file line number	Diff line number	Diff line change
Expand Up		@@ -25,3 +25,4 @@ Issues = "https://github.com/broadinstitute/Drop-seq/issues"

		[project.scripts]
		join_and_filter_tsv = "dropseq_aggregation.join_and_filter_tsv:main"
		cat_tsvs = "dropseq_aggregation.cat_tsvs:main"