Skip to content

Commit

Permalink
Enable TrimStartingSequence to process paired-end data
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed Jan 2, 2025
1 parent 6c0dc16 commit 01fb674
Show file tree
Hide file tree
Showing 3 changed files with 2,063 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,19 @@
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

@CommandLineProgramProperties(summary = "Trim the given sequence from the beginning of reads",
oneLineSummary = "Trim the given sequence from the beginning of reads",
programGroup = DropSeq.class)
public class TrimStartingSequence extends CommandLineProgram {
public static final String DEFAULT_TRIM_TAG = "ZS";
static final int UNPAIRED = 0;
static final int FIRST_OF_PAIR = 1;
static final int SECOND_OF_PAIR = 2;
private static final Set<Integer> VALID_WHICH_READ = CollectionUtil.makeSet(UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR);

private final Log log = Log.getInstance(TrimStartingSequence.class);

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to analyze.")
Expand Down Expand Up @@ -85,6 +92,9 @@ public class TrimStartingSequence extends CommandLineProgram {
optional = true, mutex = {"LEGACY"})
public String LENGTH_TAG;

@Argument(doc="Which reads to trim. 0: unpaired reads; 1: first of pair; 2: second of pair")
public List<Integer> WHICH_READ = new ArrayList<>(Arrays.asList(0));

private Integer readsTrimmed=0;
private int numReadsTotal=0;
private final Histogram<Integer> numBasesTrimmed= new Histogram<>();
Expand All @@ -108,7 +118,7 @@ protected int doWork() {
TrimSequenceTemplate t = new TrimSequenceTemplate(this.SEQUENCE);

for (SAMRecord r: bamReader) {
SAMRecord rr = hardClipBarcodeFromRecordLegacy(r, t, this.NUM_BASES, this.MISMATCHES);
SAMRecord rr = shouldTrim(r)? hardClipBarcodeFromRecordLegacy(r, t, this.NUM_BASES, this.MISMATCHES): r;
writer.addAlignment(rr);
progress.record(r);
this.numReadsTotal++;
Expand All @@ -123,7 +133,7 @@ protected int doWork() {
}

for (SAMRecord r : bamReader) {
SAMRecord rr = hardClipBarcodeFromRecord(r, trimmer);
SAMRecord rr = shouldTrim(r)? hardClipBarcodeFromRecord(r, trimmer): r;
writer.addAlignment(rr);
progress.record(r);
this.numReadsTotal++;
Expand All @@ -139,12 +149,31 @@ protected int doWork() {
return 0;
}

private boolean shouldTrim(final SAMRecord r) {
if (WHICH_READ.contains(UNPAIRED) && r.getReadPairedFlag() == false) {
return true;
}
if (WHICH_READ.contains(FIRST_OF_PAIR) && r.getFirstOfPairFlag()) {
return true;

Check warning on line 157 in src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java

View check run for this annotation

Codecov / codecov/patch

src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java#L157

Added line #L157 was not covered by tests
}
if (WHICH_READ.contains(SECOND_OF_PAIR) && r.getSecondOfPairFlag()) {
return true;
}
return false;
}

@Override
protected String[] customCommandLineValidation() {
final ArrayList<String> list = new ArrayList<>(1);
if (MISMATCH_RATE != null && (MISMATCH_RATE < 0 || MISMATCH_RATE >= 1)) {
list.add("MISMATCH_RATE must be >= 0 and < 1");
}
if (!VALID_WHICH_READ.containsAll(WHICH_READ)) {
list.add("WHICH_READ must be one of " + VALID_WHICH_READ);

Check warning on line 172 in src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java

View check run for this annotation

Codecov / codecov/patch

src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java#L172

Added line #L172 was not covered by tests
}
if (WHICH_READ.isEmpty()) {
list.add("WHICH_READ must be specified");

Check warning on line 175 in src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java

View check run for this annotation

Codecov / codecov/patch

src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java#L175

Added line #L175 was not covered by tests
}
return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@

import java.io.File;
import java.util.Arrays;
import java.util.Collections;

public class TrimStartingSequenceTest {
private static final File INPUT = new File("testdata/org/broadinstitute/dropseq/readtrimming/N701.subset.tagged_filtered.sam");
private static final File TEST_DATA_DIR = new File("testdata/org/broadinstitute/dropseq/readtrimming");
private static final File INPUT = new File(TEST_DATA_DIR, "N701.subset.tagged_filtered.sam");
private static final File PAIRED_INPUT = new File(TEST_DATA_DIR, "N701.paired.subset.tagged_filtered.sam");
public static final String LENGTH_TAG = "Zl";

@Test
Expand Down Expand Up @@ -66,6 +69,39 @@ public void testTrimStartingSequenceClp() {
CloserUtil.close(Arrays.asList(outputReader, inputReader));
}

@Test
public void testTrimStartingSequencePairedClp() {
final TrimStartingSequence clp = new TrimStartingSequence();
clp.INPUT = PAIRED_INPUT;
clp.OUTPUT = TestUtils.getTempReportFile("TrimStartingSequenceTest.", ".sam");
clp.OUTPUT_SUMMARY = TestUtils.getTempReportFile("TrimStartingSequenceTest.", ".summary");
clp.SEQUENCE = "AAGCAGTGGTATCAACGCAGAGTGAATGGG";
clp.MISMATCHES=0;
clp.NUM_BASES=5;
clp.WHICH_READ = Collections.singletonList(TrimStartingSequence.SECOND_OF_PAIR);
Assert.assertEquals(clp.doWork(), 0);

// Confirm that the expected stuff is trimmed.
final SamReader outputReader = SamReaderFactory.makeDefault().open(clp.OUTPUT);
final SAMRecordIterator outputIterator = outputReader.iterator();
final SamReader inputReader = SamReaderFactory.makeDefault().open(clp.INPUT);
for (final SAMRecord inputRec: inputReader) {
Assert.assertTrue(outputIterator.hasNext());
final SAMRecord outputRec = outputIterator.next();
if (inputRec.getSecondOfPairFlag()) {
final String inputBases = inputRec.getReadString();
final String outputBases = outputRec.getReadString();
Assert.assertTrue(inputBases.endsWith(outputBases));
final String trimmedBases = inputBases.substring(0, inputBases.length() - outputBases.length());
Assert.assertTrue(clp.SEQUENCE.endsWith(trimmedBases));
}
}
Assert.assertFalse(outputIterator.hasNext());
CloserUtil.close(Arrays.asList(outputReader, inputReader));
}



/**
* @param expectedResult -1: fully trimmed, i.e. bases unchanged but all Q2
* 0: not trimmed
Expand Down
Loading

0 comments on commit 01fb674

Please sign in to comment.