Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New features to support longer read 1 #485

Merged
merged 5 commits into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public enum SequenceBaseEnum {
T('T'),
N('N');

private final char base;
public final char base;

SequenceBaseEnum(char b) {
this.base=b;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ public class CorrectAndSplitScrnaReadPairs
@Argument(doc="Tag to store the corrected barcode on the non-barcode read.")
public String BARCODE_TAG = "XC";

@Argument(doc="If true, assign BARCODE_TAG (also RAW_BARCODE_TAG and BARCODE_QUALS_TAG, if set) to both reads.")
public boolean TAG_BOTH_READS = false;

@Argument(shortName = StandardOptionDefinitions.METRICS_FILE_SHORT_NAME, optional = true,
doc="Various matching and correction metrics")
public File METRICS;
Expand Down Expand Up @@ -116,13 +119,24 @@ protected void splitBAMs() {
final SAMRecord readWithBarcode = BARCODED_READ == 1? pair.getFirstRead(): pair.getSecondRead();
final SAMRecord otherRead = BARCODED_READ == 2? pair.getFirstRead(): pair.getSecondRead();
if (RAW_BARCODE_TAG != null) {
otherRead.setAttribute(RAW_BARCODE_TAG, BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString()));
final String rawBarcode = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getReadString());
otherRead.setAttribute(RAW_BARCODE_TAG, rawBarcode);
if (TAG_BOTH_READS) {
readWithBarcode.setAttribute(RAW_BARCODE_TAG, rawBarcode);
}
}
if (BARCODE_QUALS_TAG != null) {
otherRead.setAttribute(BARCODE_QUALS_TAG, BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getBaseQualityString()));
final String barcodeQuals = BaseRange.getSequenceForBaseRange(baseRanges, readWithBarcode.getBaseQualityString());
otherRead.setAttribute(BARCODE_QUALS_TAG, barcodeQuals);
if (TAG_BOTH_READS) {
readWithBarcode.setAttribute(BARCODE_QUALS_TAG, barcodeQuals);
}
}
final String cellBarcode = getCorrectedCellBarcode(readWithBarcode);
otherRead.setAttribute(BARCODE_TAG, cellBarcode);
if (TAG_BOTH_READS) {
readWithBarcode.setAttribute(BARCODE_TAG, cellBarcode);
}
final int writerIdx = getWriterIdxForCellBarcode(cellBarcode);
writeRecord(writerIdx, pair.getFirstRead());
writeRecord(writerIdx, pair.getSecondRead());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* MIT License
*
* Copyright 2025 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.broadinstitute.dropseqrna.readtrimming;

import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.dropseqrna.cmdline.CustomCommandLineValidationHelper;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

public abstract class AbstractTrimmerClp extends CommandLineProgram {
static final int UNPAIRED = 0;
static final int FIRST_OF_PAIR = 1;
static final int SECOND_OF_PAIR = 2;
protected static final Set<Integer> VALID_WHICH_READ = CollectionUtil.makeSet(UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR);
@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to analyze.")
public File INPUT;
@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output BAM file")
public File OUTPUT;
@Argument(doc = "The output summary statistics", optional = true)
public File OUTPUT_SUMMARY;
@Argument(doc = "Which reads to trim. 0: unpaired reads; 1: first of pair; 2: second of pair")
public List<Integer> WHICH_READ = new ArrayList<>(Arrays.asList(0));
protected int readsTrimmed = 0;
protected int numReadsTotal = 0;
protected final Histogram<Integer> numBasesTrimmed = new Histogram<>();

protected boolean shouldTrim(final SAMRecord r) {
if (WHICH_READ.contains(UNPAIRED) && r.getReadPairedFlag() == false) {
return true;
}
if (WHICH_READ.contains(FIRST_OF_PAIR) && r.getFirstOfPairFlag()) {
return true;
}
if (WHICH_READ.contains(SECOND_OF_PAIR) && r.getSecondOfPairFlag()) {
return true;
}
return false;
}

@Override
protected String[] customCommandLineValidation() {
final ArrayList<String> list = new ArrayList<>(1);
if (!VALID_WHICH_READ.containsAll(WHICH_READ)) {
list.add("WHICH_READ must be one of " + VALID_WHICH_READ);
}
if (WHICH_READ.isEmpty()) {
list.add("WHICH_READ must be specified");
}
return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list);
}
}
102 changes: 102 additions & 0 deletions src/java/org/broadinstitute/dropseqrna/readtrimming/ClipReads.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* MIT License
*
* Copyright 2025 Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package org.broadinstitute.dropseqrna.readtrimming;

import htsjdk.samtools.*;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.dropseqrna.cmdline.DropSeq;
import org.broadinstitute.dropseqrna.utils.BaseRange;
import org.broadinstitute.dropseqrna.utils.SamHeaderUtil;

import java.util.Collections;
import java.util.List;

@CommandLineProgramProperties(summary = "Trim the given base range(s) from all reads of the requested paired-end mode",
oneLineSummary = "Trim base ranges from reads",
programGroup = DropSeq.class)
public class ClipReads extends AbstractTrimmerClp {
private final Log log = Log.getInstance(ClipReads.class);
@Argument(doc="Base range to remove, separated by a dash. E.g 1-4. Can remove multiple ranges by separating them " +
"with a colon. For example 1-4:17-22 removes the first 4 bases, then the 17-22 bases.")
public String BASE_RANGE;

@Override
protected int doWork() {
final ProgressLogger progress = new ProgressLogger(log);
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
SamReader inputSam = SamReaderFactory.makeDefault().open(INPUT);
SAMFileHeader h= inputSam.getFileHeader();
SamHeaderUtil.addPgRecord(h, this);
SAMFileWriter writer= new SAMFileWriterFactory().makeSAMOrBAMWriter(h, true, OUTPUT);
List<BaseRange> baseRanges = BaseRange.parseBaseRange(this.BASE_RANGE);
Collections.sort(baseRanges);
validateBaseRanges(baseRanges);
for (SAMRecord r: inputSam) {
if (shouldTrim(r)) {
clip(r, baseRanges);
this.readsTrimmed++;
}
writer.addAlignment(r);
progress.record(r);
this.numReadsTotal++;
}
CloserUtil.close(inputSam);

writer.close();
log.info("Total reads: " + this.numReadsTotal, " Reads clipped: " + this.readsTrimmed);

return 0;
}

private void clip(SAMRecord r, List<BaseRange> baseRanges) {
if (baseRanges.getLast().getEnd() > r.getReadLength()) {
throw new IllegalArgumentException("Base range end is greater than read length: " + BASE_RANGE + " for read " + r.getReadName());
}
baseRanges = BaseRange.invert(baseRanges, r.getReadLength());
r.setReadBases(BaseRange.getBytesForBaseRange(baseRanges, r.getReadBases()));
r.setBaseQualities(BaseRange.getBytesForBaseRange(baseRanges, r.getBaseQualities()));
}

private void validateBaseRanges(List<BaseRange> baseRanges) {
int prevEnd = 0;
for (BaseRange b: baseRanges) {
if (b.getStart() < 1 || b.getEnd() < 1) {
throw new IllegalArgumentException("Base ranges must be 1 or greater: " + BASE_RANGE);
}
if (b.getStart() > b.getEnd()) {
throw new IllegalArgumentException("Base range start must be less than or equal to end: " + BASE_RANGE);
}
if (b.getStart() <= prevEnd) {
throw new IllegalArgumentException("Base ranges must not overlap: " + BASE_RANGE);
}
prevEnd = b.getEnd();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,14 @@
import java.util.Arrays;

@CommandLineProgramProperties(summary = "", oneLineSummary = "", programGroup = DropSeq.class)
public class PolyATrimmer extends CommandLineProgram {
public class PolyATrimmer extends AbstractTrimmerClp {

private final Log log = Log.getInstance(PolyATrimmer.class);

// In debug mode, print a message if there is this much adapter match but no
// poly A found.
private static final int NO_POLY_A_ADAPTER_DEBUG_THRESHOLD = 6;

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to analyze.")
public File INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output BAM file")
public File OUTPUT;

@Argument(doc = "The output summary statistics", optional = true)
public File OUTPUT_SUMMARY;

@Argument(shortName = "NEW", doc="The old polyA trimmer looks for the longest run of As that is at least NUM_BASES" +
" long and has no more than MISMATCHES bases that are not A. For the new polyA trimmer, it is assumed that " +
"the polyA run either extends to the end of the read, or to the beginning of adapter sequence at the end of " +
Expand Down Expand Up @@ -109,7 +100,6 @@ public class PolyATrimmer extends CommandLineProgram {
@Argument(doc = "When looking for poly A, allow this fraction of bases not to be A (new trim algo)")
public double MAX_POLY_A_ERROR_RATE = 0.1;

private Integer readsTrimmed = 0;
private int readsCompletelyTrimmed = 0;
final private Histogram<Integer> numBasesTrimmed = new Histogram<>();

Expand Down Expand Up @@ -139,36 +129,39 @@ protected int doWork() {
else
polyAFinder = simplePolyAFinder;
for (SAMRecord r : bamReader) {
final SimplePolyAFinder.PolyARun polyARun = polyAFinder.getPolyAStart(r);
final int polyAStart = polyARun.startPos;

if (log.isEnabled(Log.LogLevel.DEBUG)) {
final PolyAFinder.PolyARun simple;
final PolyAFinder.PolyARun withAdapter;
if (USE_NEW_TRIMMER) {
withAdapter = polyARun;
simple = simplePolyAFinder.getPolyAStart(r);
} else {
simple = polyARun;
withAdapter = polyAWithAdapterFinder.getPolyAStart(r);
if (shouldTrim(r)) {
final SimplePolyAFinder.PolyARun polyARun = polyAFinder.getPolyAStart(r);
final int polyAStart = polyARun.startPos;

if (log.isEnabled(Log.LogLevel.DEBUG)) {
final PolyAFinder.PolyARun simple;
final PolyAFinder.PolyARun withAdapter;
if (USE_NEW_TRIMMER) {
withAdapter = polyARun;
simple = simplePolyAFinder.getPolyAStart(r);
} else {
simple = polyARun;
withAdapter = polyAWithAdapterFinder.getPolyAStart(r);
}
logTrimDifference(simple, withAdapter, r);
}
logTrimDifference(simple, withAdapter, r);
}

hardClipPolyAFromRecord(r, polyAStart);
// Note that if there wasn't a polyA run found, then adapter match is not recorded
if (LENGTH_TAG != null && polyARun.length > 0 && !polyARun.isNoMatch()) {
r.setAttribute(LENGTH_TAG, polyARun.length);
}
if (ADAPTER_TAG != null && polyARun.adapterLength > 0 && !polyARun.isNoMatch()) {
r.setAttribute(ADAPTER_TAG, polyARun.adapterLength);
hardClipPolyAFromRecord(r, polyAStart);
// Note that if there wasn't a polyA run found, then adapter match is not recorded
if (LENGTH_TAG != null && polyARun.length > 0 && !polyARun.isNoMatch()) {
r.setAttribute(LENGTH_TAG, polyARun.length);
}
if (ADAPTER_TAG != null && polyARun.adapterLength > 0 && !polyARun.isNoMatch()) {
r.setAttribute(ADAPTER_TAG, polyARun.adapterLength);
}
}
writer.addAlignment(r);
progress.record(r);
++numReadsTotal;
}
CloserUtil.close(bamReader);
writer.close();
log.info("Total " + progress.getCount() + " reads processed.");
log.info("Total " + numReadsTotal + " reads processed.");
log.info("Number of reads trimmed: ", this.readsTrimmed);
log.info("Number of reads completely trimmed: ", this.readsCompletelyTrimmed);
log.debug(String.format("differences: %d; old didn't clip: %d; new didn't clip: %d", numDiffs, numOldDidntClip,
Expand Down
Loading
Loading