eliminate subparser in dropseq_aggregation clp (#474)
* rename package stuff I missed

* Eliminate subparser stuff
alecw authored Oct 17, 2024
1 parent 5ad8d58 commit 277d7b7
Showing 6 changed files with 53 additions and 101 deletions.
6 changes: 3 additions & 3 deletions src/python/dropseq_aggregation/README.md
@@ -1,13 +1,13 @@
# Simple python tools for managing tabular cell metadata text files..
# Simple python tools for managing tabular cell metadata text files.

## Installation

Requires python >= 3.8
```
pip install 'git+https://github.com/broadinstitute/Drop-seq.git#egg=dropseq_metadata&subdirectory=src/python/dropseq_metadata'
pip install 'git+https://github.com/broadinstitute/Drop-seq.git#egg=dropseq_aggregation&subdirectory=src/python/dropseq_aggregation'
```

## Usage

Run `dropseq_metadata -h` for usage information.
Run `join_and_filter_tsv -h` for usage information.
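For a concrete sense of the renamed entry point, here is a minimal sketch of a programmatic call. It relies on the `main(args=None)` signature introduced later in this commit, uses only flags visible in this diff (`--input`, `--output`, `--include`, `--log-level`), and the file name and `DONOR` value are hypothetical:

```python
from dropseq_aggregation.join_and_filter_tsv import main

# Equivalent to the command line (file name and values are hypothetical):
#   join_and_filter_tsv -i cell_metadata.txt -o filtered.txt --include DONOR donor1 --log-level DEBUG
exit_code = main(["--input", "cell_metadata.txt", "--output", "filtered.txt",
                  "--include", "DONOR", "donor1", "--log-level", "DEBUG"])
```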

5 changes: 3 additions & 2 deletions src/python/dropseq_aggregation/pyproject.toml
@@ -2,7 +2,7 @@
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "dropseq_metadata"
name = "dropseq_aggregation"
version = "3.0.2"
dependencies = [
"pandas==2.2.3"
@@ -24,4 +24,5 @@ Homepage = "https://github.com/broadinstitute/Drop-seq/"
Issues = "https://github.com/broadinstitute/Drop-seq/issues"

[project.scripts]
dropseq_metadata = "dropseq_metadata.cli:main"
join_and_filter_tsv = "dropseq_aggregation.join_and_filter_tsv:main"
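The `[project.scripts]` entry is what puts `join_and_filter_tsv` on the PATH: at install time the build backend generates a small launcher that imports `dropseq_aggregation.join_and_filter_tsv` and calls its `main`. The generated shim behaves roughly like this sketch (simplified; the actual wrapper is produced by the installer):

```python
import sys

from dropseq_aggregation.join_and_filter_tsv import main

if __name__ == "__main__":
    sys.exit(main())
```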

22 changes: 22 additions & 0 deletions src/python/dropseq_aggregation/src/dropseq_aggregation/__init__.py
@@ -20,3 +20,25 @@
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging

# I cannot believe I need to do this to cause logger to write to stderr.
logging.basicConfig(
    level=logging.INFO,  # Set the logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()]  # StreamHandler writes to sys.stderr by default
)
logger = logging.getLogger(__name__)

dctLogLevel = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL
}

def add_log_argument(parser):
    parser.add_argument("--log-level", "-l", default="INFO", choices=dctLogLevel.keys(),
                        help="Set the logging level. (default: %(default)s)")
75 changes: 0 additions & 75 deletions src/python/dropseq_aggregation/src/dropseq_aggregation/cli.py

This file was deleted.

22 changes: 13 additions & 9 deletions src/python/dropseq_aggregation/src/dropseq_aggregation/join_and_filter_tsv.py
@@ -39,12 +39,10 @@

import argparse
import sys

import pandas as pd
from pandas.errors import MergeError
try:
    from . import cli
except ImportError:
    import cli
from . import logger, add_log_argument

DELETEME_COLUMN_SUFFIX = '_deleteme'

@@ -77,8 +75,9 @@ def try_convert_string(s):
def load_values_file(file):
    return pd.read_csv(file, sep='\t', header=None).iloc[0]

def add_subparser(subparsers):
    parser = subparsers.add_parser("join_and_filter_tsv", description=__doc__)
def parse_args(args):
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    add_log_argument(parser)
    parser.add_argument("--input", "-i", type=argparse.FileType('r'),
                        help="Primary tab-separated file to join. Default: %(default)s", default=sys.stdin)
    parser.add_argument("--output", "-o", type=argparse.FileType('w'),
@@ -101,8 +100,12 @@ def add_subparser(subparsers):
help="Filter out rows where COLUMN is not one of the given VALUEs. May be specified multiple times.")
parser.add_argument("--exclude", nargs='+', action='append', default=[], metavar=('COLUMN', 'VALUE'),
help="Filter out rows where COLUMN is one of the given VALUEs. May be specified multiple times.")
return parser.parse_args(args)

def main(args=None):
    return run(parse_args(args))

def main(options):
def run(options):
    # load the primary file
    primary = pd.read_csv(options.input, sep='\t')
    options.input.close()
@@ -115,7 +118,7 @@ def main(options):
            primary = primary.merge(secondary, how='left', left_on=input_col, right_on=join_col, validate="many_to_one",
                                    suffixes=(None, DELETEME_COLUMN_SUFFIX))
        except MergeError as e:
            cli.logger.error(f"Error joining {join_file} on {input_col} and {join_col}: {e}")
            logger.error(f"Error joining {join_file} on {input_col} and {join_col}: {e}")
            return 1
        if not join_col_in_left:
            # drop the join column from the merged data frame
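The `validate="many_to_one"` argument is what turns a duplicated join key in a secondary file into a hard error rather than a silent row explosion; `test_negative_non_unique_join` below exercises exactly this. A standalone sketch of that behavior, with made-up data:

```python
import pandas as pd
from pandas.errors import MergeError

left = pd.DataFrame({"CELL_BARCODE": ["A", "B"], "x": [1, 2]})
right = pd.DataFrame({"CELL_BARCODE": ["A", "A"], "y": [3, 4]})  # non-unique key

try:
    left.merge(right, how="left", on="CELL_BARCODE", validate="many_to_one")
except MergeError as e:
    # many_to_one requires the right-hand keys to be unique
    print(f"join rejected: {e}")
```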
@@ -153,4 +156,5 @@ def main(options):
    options.output.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())
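The shape of the refactor above: instead of registering an `add_subparser(subparsers)` hook with a shared CLI, the tool now owns a standalone `parse_args` / `run` / `main` trio, with `main(args=None)` as the console-script entry point and `run(options)` callable directly from tests. A generic sketch of the pattern (hypothetical tool, not code from this commit):

```python
import argparse
import sys

def parse_args(args=None):
    # Standalone parser; no subparser registration needed once each tool is its own console script.
    parser = argparse.ArgumentParser(description="hypothetical standalone tool")
    parser.add_argument("--name", default="world")
    return parser.parse_args(args)

def run(options):
    # Testable core: tests build an options object and call run() directly.
    print(f"hello, {options.name}")
    return 0

def main(args=None):
    return run(parse_args(args))

if __name__ == "__main__":
    sys.exit(main())
```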
24 changes: 12 additions & 12 deletions src/python/dropseq_aggregation/tests/test_join_and_filter_tsv.py
@@ -53,7 +53,7 @@ def test_basic(self):
        secondary = os.path.join(self.testDataDir, "sample1.100.scPred.txt")
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        self.assertSharedColumnsEqual(self.outputFile, primary)
        self.assertSharedColumnsEqual(self.outputFile, secondary)

@@ -62,7 +62,7 @@ def test_fewer_secondary(self):
        secondary = os.path.join(self.testDataDir, "sample1.50.scPred.txt")
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        self.assertSharedColumnsEqual(self.outputFile, primary)
        self.assertSharedColumnsEqual(self.outputFile, secondary, wideRows=49)

@@ -71,7 +71,7 @@ def test_fewer_primary(self):
        secondary = os.path.join(self.testDataDir, "sample1.100.scPred.txt")
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        self.assertSharedColumnsEqual(self.outputFile, primary)
        self.assertSharedColumnsEqual(self.outputFile, secondary, narrowRows=49)

@@ -82,7 +82,7 @@ def test_additional_join(self):
        options = self.options._replace(input=open(primary),
                                        join=[(secondary1, "CELL_BARCODE", "CELL_BARCODE"),
                                              (secondary2, "DONOR", "DONOR")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        self.assertSharedColumnsEqual(self.outputFile, primary)
        self.assertSharedColumnsEqual(self.outputFile, secondary1)
        self.assertMultiJoin(self.outputFile, secondary2, "DONOR", "DONOR")
@@ -94,7 +94,7 @@ def test_set(self):
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")],
                                        set=setTuples)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        for column, value in setTuples:
            self.assertTrue((outputDf[column] == value).all())
@@ -107,7 +107,7 @@ def test_min(self):
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")],
                                        min=[("max.prob", "0.8")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        self.assertTrue((outputDf["max.prob"] >= 0.8).all())
        primaryDf = pd.read_csv(primary, sep='\t')
@@ -119,7 +119,7 @@ def test_max(self):
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")],
                                        max=[("max.prob", "0.8")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        self.assertTrue((outputDf["max.prob"] <= 0.8).all())
        primaryDf = pd.read_csv(primary, sep='\t')
@@ -130,7 +130,7 @@ def test_include_file(self):
        includeFile = os.path.join(self.testDataDir, "donor_subset.txt")
        options = self.options._replace(input=open(primary),
                                        include_file=[("DONOR", includeFile)])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        includeValues = pd.read_csv(includeFile, sep='\t', header=None).iloc[0]
        self.assertTrue((outputDf["DONOR"].isin(includeValues)).all())
@@ -140,7 +140,7 @@ def test_exclude_file(self):
        excludeFile = os.path.join(self.testDataDir, "donor_subset.txt")
        options = self.options._replace(input=open(primary),
                                        exclude_file=[("DONOR", excludeFile)])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        excludeValues = pd.read_csv(excludeFile, sep='\t', header=None).iloc[0]
        self.assertFalse((outputDf["DONOR"].isin(excludeValues)).any())
@@ -152,7 +152,7 @@ def test_include_exclude(self):
        options = self.options._replace(input=open(primary),
                                        include=[["DONOR"] + donorsToInclude],
                                        exclude=[["predClass"] + predClassesToExclude])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        self.assertTrue((outputDf["DONOR"].isin(donorsToInclude)).all())
        self.assertFalse((outputDf["predClass"].isin(predClassesToExclude)).any())
@@ -162,13 +162,13 @@ def test_negative_non_unique_join(self):
        secondary = os.path.join(self.testDataDir, "sample1.nonunique.scPred.txt")
        options = self.options._replace(input=open(primary),
                                        join=[(secondary, "CELL_BARCODE", "CELL_BARCODE")])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 1)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 1)

    def test_boolean(self):
        primary = os.path.join(self.testDataDir, "sample1.100.cell_metadata.txt")
        options = self.options._replace(input=open(primary),
                                        exclude=[["doublet", "true"]])
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.main(options), 0)
        self.assertEqual(dropseq_aggregation.join_and_filter_tsv.run(options), 0)
        outputDf = pd.read_csv(self.outputFile, sep='\t')
        self.assertFalse(outputDf["doublet"].any())
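Throughout these tests the `main(options)` calls become `run(options)`: the tests bypass argparse entirely and hand `run` a pre-built options object, using `_replace` for per-test overrides. A sketch of that fixture pattern (the field set is inferred from the tests above, so treat the exact names as assumptions):

```python
from collections import namedtuple

# Hypothetical mirror of the argparse namespace consumed by run(); field names inferred from the tests.
Options = namedtuple("Options", ["input", "output", "join", "set", "min", "max",
                                 "include", "exclude", "include_file", "exclude_file"])
defaults = Options(input=None, output=None, join=[], set=[], min=[], max=[],
                   include=[], exclude=[], include_file=[], exclude_file=[])

# Per-test override, mirroring self.options._replace(...) in the tests above.
options = defaults._replace(include=[["DONOR", "donor1", "donor2"]])
```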

