Skip to content

Commit

Permalink
Support multiple BAMs and/or bam_list(s) as input to FilterBam
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed May 7, 2024
1 parent 93a0242 commit 8da9d28
Show file tree
Hide file tree
Showing 5 changed files with 1,059 additions and 16 deletions.
25 changes: 12 additions & 13 deletions src/java/org/broadinstitute/dropseqrna/utils/FilterBam.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import htsjdk.samtools.util.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.dropseqrna.barnyard.DigitalExpression;
Expand All @@ -50,10 +51,8 @@
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil;
import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;

Expand All @@ -69,8 +68,8 @@
public class FilterBam extends CommandLineProgram{
private final Log log = Log.getInstance(FilterBam.class);

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to analyze.")
public File INPUT;
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM, BAM or bam_list file to analyze.")
public List<File> INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output report")
public File OUTPUT;
Expand Down Expand Up @@ -143,22 +142,22 @@ public enum MatchTypes {

@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
INPUT = FileListParsingUtils.expandFileList(INPUT);
INPUT.forEach(IOUtil::assertFileIsReadable);
IOUtil.assertFileIsWritable(OUTPUT);
buildPatterns();
SamHeaderAndIterator samHeaderAndIterator = SamFileMergeUtil.mergeInputs(INPUT, true);

SamReader in = SamReaderFactory.makeDefault().open(INPUT);

SAMFileHeader fileHeader = editSequenceDictionary(in.getFileHeader().clone());
SAMFileHeader fileHeader = editSequenceDictionary(samHeaderAndIterator.header.clone());
SamHeaderUtil.addPgRecord(fileHeader, this);
SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(fileHeader, true, OUTPUT);
ProgressLogger progLog=new ProgressLogger(log);

final boolean sequencesRemoved = fileHeader.getSequenceDictionary().getSequences().size() != in.getFileHeader().getSequenceDictionary().getSequences().size();
final boolean sequencesRemoved = fileHeader.getSequenceDictionary().getSequences().size() != samHeaderAndIterator.header.getSequenceDictionary().getSequences().size();

FilteredReadsMetric m = new FilteredReadsMetric();

for (final SAMRecord r : in) {
for (final SAMRecord r : new IterableAdapter<SAMRecord>(samHeaderAndIterator.iterator)) {
progLog.record(r);
if (!filterRead(r)) {
String sequenceName = stripReferencePrefix(r.getReferenceName());
Expand Down Expand Up @@ -193,7 +192,7 @@ protected int doWork() {
}
// write the summary if the summary file is not null.
writeSummary(this.SUMMARY, m);
CloserUtil.close(in);
CloserUtil.close(samHeaderAndIterator.iterator);
out.close();
FilterProgramUtils.reportAndCheckFilterResults("reads", m.READS_ACCEPTED, m.READS_REJECTED,
PASSING_READ_THRESHOLD, log);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.zip.DeflaterFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
Expand All @@ -38,6 +39,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

// Note that there is some weirdness with IntelDeflater on Mac OS that sometimes causes SEGV. This test therefore
Expand All @@ -47,7 +49,13 @@ public class FilterBamTest {

// TO GENERATE COUNTS OF CONTIGS FOR HUMAN/MOUSE...
// samtools view human_mouse_smaller.bam |cut -f 3 |sort |uniq -c > human_mouse_smaller.contig_counts.txt
private static File ORGANISM_INPUT_FILE=new File ("testdata/org/broadinstitute/dropseq/utils/human_mouse_smaller.bam");
private static File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/utils");
private static File ORGANISM_INPUT_FILE=new File (TEST_DATA_DIR,"human_mouse_smaller.bam");
private static File BAM_LIST = new File(TEST_DATA_DIR, "human_mouse_x.bam_list");
private static List<File> BAM_FILES = Arrays.asList(
new File(TEST_DATA_DIR, "human_mouse_x.1.sam"),
new File(TEST_DATA_DIR, "human_mouse_x.2.sam")
);

private static final long NUM_HUMAN_READS = 136859;
public static final long NUM_MOUSE_READS = 76036;
Expand All @@ -56,7 +64,7 @@ public class FilterBamTest {
@Test
public void testOrganismFiltering () throws IOException {
FilterBam f = new FilterBam();
f.INPUT=ORGANISM_INPUT_FILE;
f.INPUT= Collections.singletonList(ORGANISM_INPUT_FILE);
f.SUMMARY=File.createTempFile("paired_input", ".summary.txt");
f.OUTPUT=File.createTempFile("paired_input", ".bam");
f.OUTPUT.deleteOnExit();
Expand Down Expand Up @@ -90,7 +98,7 @@ public void testFailingThreshold(final boolean fractionalThreshold) throws IOExc
}
private void filterWithThreshold(final double threshold) throws IOException {
FilterBam f = new FilterBam();
f.INPUT=ORGANISM_INPUT_FILE;
f.INPUT=Collections.singletonList(ORGANISM_INPUT_FILE);;
f.OUTPUT=File.createTempFile("paired_input", ".bam");
f.OUTPUT.deleteOnExit();
f.REF_SOFT_MATCHED_RETAINED=Arrays.asList("HUMAN");
Expand Down Expand Up @@ -402,6 +410,40 @@ public Object[][] testEndToEndDataProvider() {
return ret.toArray(new Object[ret.size()][]);
}

@Test(dataProvider = "testMultipleBamInputDataProvider")
public void testMultipleBamInput(final List<File> inputBamsOrBamList) throws IOException {
FilterBam f = new FilterBam();
f.INPUT= inputBamsOrBamList;
f.SUMMARY=TestUtils.getTempReportFile("human_mouse.x", ".summary.txt");
f.OUTPUT=TestUtils.getTempReportFile("paired_input", ".bam");
f.REF_SOFT_MATCHED_RETAINED=Arrays.asList("HUMAN");
// need to set this to an empty list, as that's what the constructor for FilterBam would do.
f.STRIP_REF_PREFIX=new ArrayList<>();
Assert.assertEquals(f.doWork(), 0);
// Just count the reads in the input files because we know no reads are filtered out
int numReads = BAM_FILES.stream().mapToInt(this::countSamRecords).sum();
List<FilteredReadsMetric> metrics = MetricsFile.readBeans(f.SUMMARY);
Assert.assertEquals(numReads, metrics.get(0).READS_ACCEPTED);
}
@DataProvider(name="testMultipleBamInputDataProvider")
public Object[][] testMultipleBamInputDataProvider() {
return new Object[][] {
{BAM_FILES},
{Collections.singletonList(BAM_LIST)}
};
}

private int countSamRecords(final File f) {
SamReaderFactory factory = SamReaderFactory.makeDefault();
SamReader reader = factory.open(f);
int count=0;
for (SAMRecord r: reader) {
count++;
}
CloserUtil.close(reader);
return count;
}

private DeflaterFactory deflaterFactory;
@BeforeMethod
public void setDeflaterFactory() {
Expand Down
Loading

0 comments on commit 8da9d28

Please sign in to comment.