diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SNPUMIBasePileupIterator.java b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SNPUMIBasePileupIterator.java index 2d840abf..601e52b9 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SNPUMIBasePileupIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SNPUMIBasePileupIterator.java @@ -38,13 +38,7 @@ import org.broadinstitute.dropseqrna.utils.ProgressLoggingIterator; import org.broadinstitute.dropseqrna.utils.SamWriterSink; import org.broadinstitute.dropseqrna.utils.StringTagComparator; -import org.broadinstitute.dropseqrna.utils.readiterators.ChromosomeFilteringIterator; -import org.broadinstitute.dropseqrna.utils.readiterators.GeneFunctionIteratorWrapper; -import org.broadinstitute.dropseqrna.utils.readiterators.MapQualityFilteredIterator; -import org.broadinstitute.dropseqrna.utils.readiterators.MissingTagFilteringIterator; -import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator; -import org.broadinstitute.dropseqrna.utils.readiterators.SamRecordSortingIteratorFactory; -import org.broadinstitute.dropseqrna.utils.readiterators.StrandStrategy; +import org.broadinstitute.dropseqrna.utils.readiterators.*; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMSequenceDictionary; @@ -103,11 +97,12 @@ public SNPUMIBasePileupIterator (final SamHeaderAndIterator headerAndIter, final final ProgressLogger logger = new ProgressLogger(LOG); // add a progress logger. - ProgressLoggingIterator loggingIterator = new ProgressLoggingIterator (headerAndIter.iterator, logger); + final ProgressLoggingIterator loggingIterator = new ProgressLoggingIterator (headerAndIter.iterator, logger); + final STARSoloChimericReadFilteringIterator chimericReadFilteringIterator = new STARSoloChimericReadFilteringIterator(loggingIterator, molecularBarcodeTag); // filter out to contigs in the interval list. Set contigsToFilter = snpIntervals.getIntervals().stream().map(x -> x.getContig()).collect(Collectors.toSet()); - ChromosomeFilteringIterator cfi = new ChromosomeFilteringIterator(loggingIterator, contigsToFilter, false); + ChromosomeFilteringIterator cfi = new ChromosomeFilteringIterator(chimericReadFilteringIterator, contigsToFilter, false); // Filter reads on map quality MapQualityFilteredIterator filteringIterator = new MapQualityFilteredIterator(cfi, readMQ, true); diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIterator.java new file mode 100644 index 00000000..833bffe5 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIterator.java @@ -0,0 +1,57 @@ +/* + * MIT License + * + * Copyright 2024 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.dropseqrna.utils.FilteredIterator; + +import java.util.Iterator; + +/** + * Filter out reads that STARsolo has marked as chimeric. + */ +public class STARSoloChimericReadFilteringIterator + extends FilteredIterator { + + private final String molecularBarcodeTag; + + public STARSoloChimericReadFilteringIterator(Iterator underlyingIterator, String molecularBarcodeTag) { + super(underlyingIterator); + this.molecularBarcodeTag = molecularBarcodeTag; + } + + @Override + public boolean filterOut(SAMRecord rec) { + final String umi = rec.getStringAttribute(molecularBarcodeTag); + if (umi == null) { + throw new IllegalArgumentException("Read " + rec.getSAMString() + " does not have a UMI tag " + molecularBarcodeTag); + } + return umi.equals(CHIMERIC_UMI); + } + + /** + * The UMI value that STARsolo uses to indicate a chimeric read. + */ + public static final String CHIMERIC_UMI = "-"; +} diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java index 3821c085..bc95580c 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java @@ -166,6 +166,9 @@ public UMIIterator(final SamHeaderAndIterator headerAndIterator, Iterator filteringIterator = new MissingTagFilteringIterator(headerAndIterator.iterator, cellBarcodeTag, geneTag, molecularBarcodeTag); + // Filter out reads that STARsolo has marked as chimeric + filteringIterator = new STARSoloChimericReadFilteringIterator(filteringIterator, molecularBarcodeTag); + // Filter reads on map quality. Optionally keep non-primary reads at low map quality. boolean rejectNonPrimaryReads = readMQ>3; if (!rejectNonPrimaryReads) diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIteratorTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIteratorTest.java new file mode 100644 index 00000000..4886e9f5 --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/STARSoloChimericReadFilteringIteratorTest.java @@ -0,0 +1,50 @@ +/* + * MIT License + * + * Copyright 2024 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class STARSoloChimericReadFilteringIteratorTest { + + public final File INPUT = new File("testdata/org/broadinstitute/dropseq/utils/readiterators/STARsolo_chimeric_marked.sam"); + + @Test + public void testBasic() { + SamReader reader = SamReaderFactory.makeDefault().open(INPUT); + STARSoloChimericReadFilteringIterator iterator = new STARSoloChimericReadFilteringIterator(reader.iterator(), "XM"); + int numPassing = 0; + while (iterator.hasNext()) { + ++numPassing; + final SAMRecord rec = iterator.next(); + Assert.assertEquals(rec.getReadName(), "not_chimeric"); + } + Assert.assertEquals(numPassing, 1); + } +} diff --git a/testdata/org/broadinstitute/dropseq/utils/readiterators/STARsolo_chimeric_marked.sam b/testdata/org/broadinstitute/dropseq/utils/readiterators/STARsolo_chimeric_marked.sam new file mode 100644 index 00000000..4581c4e9 --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/utils/readiterators/STARsolo_chimeric_marked.sam @@ -0,0 +1,5 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 UR:file:/broad/mccarroll/software/metadata/individual_reference/hg19/hg19.fasta +@RG ID:00000.1 SM:N701 LB:N701 PL:illumina PU:000000000-AMY9M.1.TAAGGCGA CN:BI DT:2016-02-02T00:00:00-0500 +not_chimeric 0 1 564475 3 50M * 0 0 CTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATT AAAB3FF5DF4BEGFGGFGGFGHCHGHGGCEEGEG?FEGEFHHGGGGEHH XC:Z:CGCCTCCTCCGA MD:Z:50 XF:Z:INTERGENIC PG:Z:STAR RG:Z:00000.1 XG:Z:RP5-857K21.4 NH:i:2 NM:i:0 XM:Z:TTTCTGTG UQ:i:0 AS:i:49 gf:Z:INTRONIC gn:Z:RP5-857K21.4 gs:Z:- +chimeric 0 1 564477 3 50M * 0 0 AGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCT BAABCFFFFFFFGGGGGGGGFGHGFHGGGGGGGGGGGHHHHGGGGHHHHF XC:Z:GACATGATAGAC MD:Z:50 XF:Z:INTERGENIC PG:Z:STAR RG:Z:00000.1 XG:Z:RP5-857K21.4 NH:i:2 NM:i:0 XM:Z:- UQ:i:0 AS:i:49 gf:Z:INTRONIC gn:Z:RP5-857K21.4 gs:Z:-