Skip to content

Commit

Permalink
Script to concatenate TSVs
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed Oct 23, 2024
1 parent 8c6071b commit 24fd2d0
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/python/dropseq_aggregation/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ Issues = "https://github.com/broadinstitute/Drop-seq/issues"

[project.scripts]
join_and_filter_tsv = "dropseq_aggregation.join_and_filter_tsv:main"
cat_tsvs = "dropseq_aggregation.cat_tsvs:main"
63 changes: 63 additions & 0 deletions src/python/dropseq_aggregation/src/dropseq_aggregation/cat_tsvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# MIT License
#
# Copyright 2024 Broad Institute
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Read one or more tab-separated files with header, and write a single tab-separated file with rows concatentated from the
input files. The output columns will be the union of all the input columns, with empty values for input files that
lack a column. """

import argparse
import sys
import pandas as pd

from . import logger, add_log_argument

def main(args=None):
    """Command-line entry point: parse the arguments and delegate to run().

    :param args: argument strings handed to argparse's parse_args(); the default
        None is passed through unchanged.
    :return: whatever run() returns for the parsed options.
    """
    arg_parser = argparse.ArgumentParser(description=__doc__)
    # Project helper imported from the package; presumably registers the logging option(s).
    add_log_argument(arg_parser)

    index_col_help = ("Column to use as index. May be specified more than once. If indices are not unique, "
                      "an error will be raised.")
    arg_parser.add_argument("--index-col", "-i", action="append", default=None, help=index_col_help)
    arg_parser.add_argument(
        "--output", "-o",
        type=argparse.FileType('w'),
        default=sys.stdout,
        help="Output file. Default: stdout.",
    )
    arg_parser.add_argument(
        "input",
        type=argparse.FileType('r'),
        nargs="+",
        help="Input tab-separated files. May be specified more than once.",
    )

    return run(arg_parser.parse_args(args))

def run(options):
    """Concatenate the input TSVs row-wise and write the result as a single TSV.

    :param options: object with these attributes:
        input -- iterable of open, readable file objects, each a tab-separated
            file with a header line;
        output -- open, writable file object that receives the result;
        index_col -- list of column names that together must be unique across
            all concatenated rows, or None/empty to skip the check.
    :return: 0 on success; 1 if duplicate keys were found (nothing is written
        in that case).

    All input files and the output file are closed before this function
    returns, whether it succeeds, reports duplicates, or raises.
    """
    try:
        dfs = [pd.read_csv(f, sep="\t") for f in options.input]
    finally:
        # An explicit loop is required: map() is lazy in Python 3, so an
        # unconsumed map(close, ...) never closes anything.
        for f in options.input:
            f.close()

    try:
        # Columns are the union across inputs; cells missing from a file are left empty.
        df = pd.concat(dfs)
        if options.index_col:
            # keep=False flags every member of a duplicated group, so the log shows all offenders.
            duplicate_keys = df[df.duplicated(subset=options.index_col, keep=False)]

            if not duplicate_keys.empty:
                logger.error(f"Duplicate keys found: {duplicate_keys[options.index_col]}")
                return 1

        df.to_csv(options.output, sep="\t", index=False)
        return 0
    finally:
        # Close on every path so the handle is not leaked on the duplicate-key return.
        options.output.close()

if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
69 changes: 69 additions & 0 deletions src/python/dropseq_aggregation/tests/test_cat_tsvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
# MIT License
#
# Copyright 2024 Broad Institute
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import os
import shutil
import tempfile
import unittest
import pandas as pd
import dropseq_aggregation.cat_tsvs

# Lightweight stand-in for the parsed-options object that cat_tsvs.run() takes.
# namedtuple defaults bind to the rightmost fields, so "input" and "index_col"
# default to None while "output" must always be supplied.
OptionsTuple = collections.namedtuple("OptionsTuple", ["output", "input", "index_col"],
defaults=[None, None])

class TestCatTsvs(unittest.TestCase):
    """Tests for dropseq_aggregation.cat_tsvs.run() using the rxn*.tsv fixture files."""

    def setUp(self):
        # NOTE: relative path, so the tests only work from a working directory where it resolves.
        self.testDataDir = "../../../testdata/python/dropseq_aggregation/cat_tsvs"
        self.tmpDir = tempfile.mkdtemp(".tmp", "cat_tsvs.")
        self.outputFile = os.path.join(self.tmpDir, "output.tsv")
        self.options = OptionsTuple(open(self.outputFile, "w"))
        # Every handle opened by a test is recorded here so tearDown can close it,
        # regardless of whether run() closed it already.
        self._handles = [self.options.output]
        self.inputs = [os.path.join(self.testDataDir, f"rxn{i+1}.joined_filtered_cell_metadata.tsv") for i in range(2)]
        self.index_cols = ["PREFIX", "CELL_BARCODE"]

    def tearDown(self):
        # Close before deleting: close() is idempotent, and open handles would
        # otherwise leak (and block directory removal on some platforms).
        for handle in self._handles:
            handle.close()
        shutil.rmtree(self.tmpDir)

    def _open_inputs(self, paths):
        """Open each path for reading and register the handles for cleanup."""
        handles = [open(path) for path in paths]
        self._handles.extend(handles)
        return handles

    def _assert_concatenated(self, paths):
        """Assert the output has one row per input row and the union of the input columns."""
        outDf = pd.read_csv(self.outputFile, sep="\t")
        inDfs = [pd.read_csv(f, sep="\t") for f in paths]
        self.assertEqual(len(outDf), sum(len(df) for df in inDfs))
        self.assertEqual(set(outDf.columns), set.union(*[set(df.columns) for df in inDfs]))

    def test_basic(self):
        options = self.options._replace(index_col=self.index_cols, input=self._open_inputs(self.inputs))
        self.assertEqual(dropseq_aggregation.cat_tsvs.run(options), 0)
        self._assert_concatenated(self.inputs)

    def test_duplicate_keys(self):
        # The same file twice guarantees every key is duplicated.
        options = self.options._replace(index_col=self.index_cols,
                                        input=self._open_inputs([self.inputs[0], self.inputs[0]]))
        self.assertNotEqual(dropseq_aggregation.cat_tsvs.run(options), 0)

    def test_fewer_columns(self):
        # Drop one column from the first input so the inputs have different column sets.
        dfToClip = pd.read_csv(self.inputs[0], sep="\t", index_col=self.index_cols)
        dfToClip = dfToClip.drop(columns="doublet")
        clippedFile = os.path.join(self.tmpDir, "clipped.tsv")
        dfToClip.to_csv(clippedFile, sep="\t")
        inputs = [clippedFile, self.inputs[1]]
        options = self.options._replace(index_col=self.index_cols, input=self._open_inputs(inputs))
        self.assertEqual(dropseq_aggregation.cat_tsvs.run(options), 0)
        self._assert_concatenated(inputs)
Loading

0 comments on commit 24fd2d0

Please sign in to comment.