Skip to content

Commit

Permalink
Filter reads that STARsolo has marked as chimeric
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed Mar 21, 2024
1 parent 9a6a18e commit 45de639
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,7 @@
import org.broadinstitute.dropseqrna.utils.ProgressLoggingIterator;
import org.broadinstitute.dropseqrna.utils.SamWriterSink;
import org.broadinstitute.dropseqrna.utils.StringTagComparator;
import org.broadinstitute.dropseqrna.utils.readiterators.ChromosomeFilteringIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.GeneFunctionIteratorWrapper;
import org.broadinstitute.dropseqrna.utils.readiterators.MapQualityFilteredIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.MissingTagFilteringIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.SamRecordSortingIteratorFactory;
import org.broadinstitute.dropseqrna.utils.readiterators.StrandStrategy;
import org.broadinstitute.dropseqrna.utils.readiterators.*;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
Expand Down Expand Up @@ -103,11 +97,12 @@ public SNPUMIBasePileupIterator (final SamHeaderAndIterator headerAndIter, final
final ProgressLogger logger = new ProgressLogger(LOG);

// add a progress logger.
ProgressLoggingIterator loggingIterator = new ProgressLoggingIterator (headerAndIter.iterator, logger);
final ProgressLoggingIterator loggingIterator = new ProgressLoggingIterator (headerAndIter.iterator, logger);
final STARSoloChimericReadFilteringIterator chimericReadFilteringIterator = new STARSoloChimericReadFilteringIterator(loggingIterator, molecularBarcodeTag);

// filter out to contigs in the interval list.
Set<String> contigsToFilter = snpIntervals.getIntervals().stream().map(x -> x.getContig()).collect(Collectors.toSet());
ChromosomeFilteringIterator cfi = new ChromosomeFilteringIterator(loggingIterator, contigsToFilter, false);
ChromosomeFilteringIterator cfi = new ChromosomeFilteringIterator(chimericReadFilteringIterator, contigsToFilter, false);

// Filter reads on map quality
MapQualityFilteredIterator filteringIterator = new MapQualityFilteredIterator(cfi, readMQ, true);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* MIT License
*
* Copyright 2024 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.broadinstitute.dropseqrna.utils.readiterators;

import htsjdk.samtools.SAMRecord;
import org.broadinstitute.dropseqrna.utils.FilteredIterator;

import java.util.Iterator;

/**
* Filter out reads that STARsolo has marked as chimeric.
*/
public class STARSoloChimericReadFilteringIterator
extends FilteredIterator<SAMRecord> {

private final String molecularBarcodeTag;

public STARSoloChimericReadFilteringIterator(Iterator<SAMRecord> underlyingIterator, String molecularBarcodeTag) {
super(underlyingIterator);
this.molecularBarcodeTag = molecularBarcodeTag;
}

@Override
public boolean filterOut(SAMRecord rec) {
final String umi = rec.getStringAttribute(molecularBarcodeTag);
if (umi == null) {
throw new IllegalArgumentException("Read " + rec.getSAMString() + " does not have a UMI tag " + molecularBarcodeTag);
}
return umi.equals(CHIMERIC_UMI);
}

/**
* The UMI value that STARsolo uses to indicate a chimeric read.
*/
public static final String CHIMERIC_UMI = "-";
}
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ public UMIIterator(final SamHeaderAndIterator headerAndIterator,
Iterator<SAMRecord> filteringIterator =
new MissingTagFilteringIterator(headerAndIterator.iterator, cellBarcodeTag, geneTag, molecularBarcodeTag);

// Filter out reads that STARsolo has marked as chimeric
filteringIterator = new STARSoloChimericReadFilteringIterator(filteringIterator, molecularBarcodeTag);

// Filter reads on map quality. Optionally keep non-primary reads at low map quality.
boolean rejectNonPrimaryReads = readMQ>3;
if (!rejectNonPrimaryReads)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* MIT License
*
* Copyright 2024 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.broadinstitute.dropseqrna.utils.readiterators;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;

public class STARSoloChimericReadFilteringIteratorTest {

public final File INPUT = new File("testdata/org/broadinstitute/dropseq/utils/readiterators/STARsolo_chimeric_marked.sam");

@Test
public void testBasic() {
SamReader reader = SamReaderFactory.makeDefault().open(INPUT);
STARSoloChimericReadFilteringIterator iterator = new STARSoloChimericReadFilteringIterator(reader.iterator(), "XM");
int numPassing = 0;
while (iterator.hasNext()) {
++numPassing;
final SAMRecord rec = iterator.next();
Assert.assertEquals(rec.getReadName(), "not_chimeric");
}
Assert.assertEquals(numPassing, 1);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@HD VN:1.5 SO:coordinate
@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 UR:file:/broad/mccarroll/software/metadata/individual_reference/hg19/hg19.fasta
@RG ID:00000.1 SM:N701 LB:N701 PL:illumina PU:000000000-AMY9M.1.TAAGGCGA CN:BI DT:2016-02-02T00:00:00-0500
not_chimeric 0 1 564475 3 50M * 0 0 CTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATT AAAB3FF5DF4BEGFGGFGGFGHCHGHGGCEEGEG?FEGEFHHGGGGEHH XC:Z:CGCCTCCTCCGA MD:Z:50 XF:Z:INTERGENIC PG:Z:STAR RG:Z:00000.1 XG:Z:RP5-857K21.4 NH:i:2 NM:i:0 XM:Z:TTTCTGTG UQ:i:0 AS:i:49 gf:Z:INTRONIC gn:Z:RP5-857K21.4 gs:Z:-
chimeric 0 1 564477 3 50M * 0 0 AGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCT BAABCFFFFFFFGGGGGGGGFGHGFHGGGGGGGGGGGHHHHGGGGHHHHF XC:Z:GACATGATAGAC MD:Z:50 XF:Z:INTERGENIC PG:Z:STAR RG:Z:00000.1 XG:Z:RP5-857K21.4 NH:i:2 NM:i:0 XM:Z:- UQ:i:0 AS:i:49 gf:Z:INTRONIC gn:Z:RP5-857K21.4 gs:Z:-

0 comments on commit 45de639

Please sign in to comment.