diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SequenceBaseEnum.java b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SequenceBaseEnum.java index 71265418..269c7124 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SequenceBaseEnum.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/SequenceBaseEnum.java @@ -37,7 +37,7 @@ public enum SequenceBaseEnum { T('T'), N('N'); - private final char base; + public final char base; SequenceBaseEnum(char b) { this.base=b; diff --git a/src/java/org/broadinstitute/dropseqrna/readtrimming/AbstractTrimmerClp.java b/src/java/org/broadinstitute/dropseqrna/readtrimming/AbstractTrimmerClp.java index 3e259c54..68ad0b06 100644 --- a/src/java/org/broadinstitute/dropseqrna/readtrimming/AbstractTrimmerClp.java +++ b/src/java/org/broadinstitute/dropseqrna/readtrimming/AbstractTrimmerClp.java @@ -27,6 +27,7 @@ import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.Histogram; import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.dropseqrna.cmdline.CustomCommandLineValidationHelper; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; @@ -49,7 +50,7 @@ public abstract class AbstractTrimmerClp extends CommandLineProgram { public File OUTPUT_SUMMARY; @Argument(doc = "Which reads to trim. 
0: unpaired reads; 1: first of pair; 2: second of pair")
     public List WHICH_READ = new ArrayList<>(Arrays.asList(0));
-    protected Integer readsTrimmed = 0;
+    protected int readsTrimmed = 0;
     protected int numReadsTotal = 0;
     protected final Histogram numBasesTrimmed = new Histogram<>();
 
@@ -65,4 +66,17 @@ protected boolean shouldTrim(final SAMRecord r) {
         }
         return false;
     }
+
+    /** Validates WHICH_READ and merges the messages with those of the superclass. */
+    @Override
+    protected String[] customCommandLineValidation() {
+        final ArrayList<String> list = new ArrayList<>(1);
+        if (!VALID_WHICH_READ.containsAll(WHICH_READ)) {
+            list.add("WHICH_READ must be one of " + VALID_WHICH_READ);
+        }
+        if (WHICH_READ.isEmpty()) {
+            list.add("WHICH_READ must be specified");
+        }
+        return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list);
+    }
 }
diff --git a/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequence.java b/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequence.java
new file mode 100644
index 00000000..32d655e2
--- /dev/null
+++ b/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequence.java
@@ -0,0 +1,223 @@
+/*
+ * MIT License
+ *
+ * Copyright 2025 Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+package org.broadinstitute.dropseqrna.readtrimming;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.metrics.MetricBase;
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.*;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.dropseqrna.barnyard.digitalallelecounts.SequenceBaseEnum;
+import org.broadinstitute.dropseqrna.cmdline.CustomCommandLineValidationHelper;
+import org.broadinstitute.dropseqrna.cmdline.DropSeq;
+import org.broadinstitute.dropseqrna.utils.SamHeaderUtil;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+/**
+ * Trims a homopolymer run (poly-T by default) from the start of reads.
+ *
+ * <p>Bases are scanned from the start of each selected read, tolerating up to {@code MISMATCHES}
+ * bases that differ from {@code BASE}. A run of at least {@code NUM_BASES} bases is removed along
+ * with its qualities. If the run covers the whole read, the bases are kept and every base quality
+ * is set to {@link #FULL_TRIM_QUALITY_SCORE} instead.
+ */
+@CommandLineProgramProperties(summary = "Trim homopolymer run from beginning of reads. If the entire read" +
+        " is a homopolymer run, instead of trimming all bases are set to Q3.",
+        oneLineSummary = "Trim a homopolymer run from the beginning of reads",
+        programGroup = DropSeq.class)
+public class TrimHomopolymerStartingSequence
+extends AbstractTrimmerClp {
+    /** Base quality assigned to every base of a read that is a homopolymer run end to end. */
+    public static final byte FULL_TRIM_QUALITY_SCORE = 3;
+    private final Log log = Log.getInstance(TrimHomopolymerStartingSequence.class);
+    /** Number of reads whose run covered the whole read (quality-masked rather than clipped). */
+    private int readsCompletelyTrimmed = 0;
+
+    @Argument(doc = "How many mismatches are acceptable in the homopolymer run.")
+    public int MISMATCHES = 1;
+
+    @Argument(doc = "How many bases of homopolymer qualifies as a run to be trimmed.")
+    public int NUM_BASES = 6;
+
+    @Argument(doc = "The base to trim.")
+    public SequenceBaseEnum BASE = SequenceBaseEnum.T;
+    /** Byte form of {@link #BASE}; assigned at the start of {@link #doWork()}. */
+    private byte BASE_BYTE;
+
+    /**
+     * Checks NUM_BASES and MISMATCHES, then merges the messages with those of the superclass.
+     *
+     * @return the value built by {@code CustomCommandLineValidationHelper.makeValue}
+     */
+    @Override
+    protected String[] customCommandLineValidation() {
+        final ArrayList<String> list = new ArrayList<>(1);
+        if (NUM_BASES < 1) {
+            list.add("NUM_BASES must be greater than 0.");
+        }
+        if (MISMATCHES < 0) {
+            list.add("MISMATCHES must be greater than or equal to 0.");
+        }
+        return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list);
+    }
+
+    /**
+     * Copies INPUT to OUTPUT, clipping each read accepted by {@code shouldTrim}, and writes the
+     * summary when OUTPUT_SUMMARY is set.
+     *
+     * @return 0 once all records have been written
+     * @throws IllegalArgumentException if BASE is rejected by {@code SequenceUtil.isValidBase}
+     */
+    @Override
+    protected int doWork() {
+        IOUtil.assertFileIsReadable(INPUT);
+        IOUtil.assertFileIsWritable(OUTPUT);
+        BASE_BYTE = StringUtil.charToByte(BASE.base);
+        if (!SequenceUtil.isValidBase(BASE_BYTE)) {
+            // Format the char field rather than the enum constant: %c rejects non-character arguments.
+            throw new IllegalArgumentException(String.format("BASE (%c) must be a valid base, one of ACGT.", BASE.base));
+        }
+        final ProgressLogger progress = new ProgressLogger(log);
+
+        SamReader bamReader = SamReaderFactory.makeDefault().open(this.INPUT);
+        SAMFileHeader header = bamReader.getFileHeader();
+        SamHeaderUtil.addPgRecord(header, this);
+        SAMFileWriter writer= new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, OUTPUT);
+        for (SAMRecord r : bamReader) {
+            if (shouldTrim(r)) {
+                maybeClip(r);
+            }
+            writer.addAlignment(r);
+            progress.record(r);
+            this.numReadsTotal++;
+        }
+
+        CloserUtil.close(bamReader);
+
+        writer.close();
+        log.info("Number of reads trimmed: " + this.readsTrimmed, " total reads: " + this.numReadsTotal);
+        if (this.OUTPUT_SUMMARY!=null) writeSummary();
+
+        return 0;
+    }
+
+    /**
+     * Clips the leading homopolymer run from the record in place. When the run spans the whole
+     * read, the bases are kept and all base qualities are set to FULL_TRIM_QUALITY_SCORE instead.
+     *
+     * @param r the record to modify
+     */
+    private void maybeClip(final SAMRecord r) {
+        final byte[] readBases = r.getReadBases();
+        final int trimPosition = getTrimPosition(readBases);
+        if (trimPosition > 0) {
+            if (trimPosition == readBases.length) {
+                // if the entire read is a homopolymer run, set all qualities to Q3
+                byte[] value = new byte[readBases.length];
+                Arrays.fill(value, FULL_TRIM_QUALITY_SCORE);
+                r.setBaseQualities(value);
+                this.readsCompletelyTrimmed++;
+            } else {
+                r.setReadBases(Arrays.copyOfRange(readBases, trimPosition, readBases.length));
+                final byte[] readQuals = r.getBaseQualities();
+                // Skip the slice when there are fewer qualities than trimmed bases (presumably a
+                // record stored without qualities -- confirm against htsjdk) instead of failing.
+                if (readQuals.length >= trimPosition) {
+                    r.setBaseQualities(Arrays.copyOfRange(readQuals, trimPosition, readQuals.length));
+                }
+            }
+            this.readsTrimmed++;
+        }
+        // Recorded for every inspected read: 0 when nothing was found, and the full read length
+        // when the read was quality-masked rather than clipped.
+        this.numBasesTrimmed.increment(trimPosition);
+    }
+
+    /**
+     * Determines how many leading bases make up the homopolymer run.
+     *
+     * <p>Up to MISMATCHES bases that differ from BASE are tolerated; the run stops just before the
+     * next mismatch after that, or at the end of the read.
+     *
+     * @param readBases the bases of the read
+     * @return the length of the run, or 0 when the read or the run is shorter than NUM_BASES, or
+     *         when the run consists solely of mismatches
+     */
+    private int getTrimPosition(final byte[] readBases) {
+        if (readBases.length < NUM_BASES) {
+            return 0;
+        }
+        int trimPosition = readBases.length;
+        // Counts mismatches inside the candidate run only; the mismatch that ends the run is not
+        // included, so the all-mismatch test below compares the count with the run length directly.
+        int numMismatches = 0;
+        for (int i = 0; i < readBases.length; i++) {
+            if (!SequenceUtil.basesEqual(readBases[i], BASE_BYTE)) {
+                if (numMismatches == MISMATCHES) {
+                    trimPosition = i;
+                    break;
+                }
+                numMismatches++;
+            }
+        }
+        if (trimPosition < NUM_BASES) {
+            return 0;
+        }
+        if (numMismatches == trimPosition) {
+            // every base in the candidate run was a mismatch; only possible when MISMATCHES >= NUM_BASES
+            return 0;
+        }
+        return trimPosition;
+    }
+
+    /** Writes the read counts and the histogram of trim positions to OUTPUT_SUMMARY. */
+    private void writeSummary() {
+        final TrimMetric metric = new TrimMetric();
+        metric.TRIMMED_READS = this.readsTrimmed;
+        metric.TOTAL_READS = this.numReadsTotal;
+        metric.COMPLETED_TRIMMED_READS = this.readsCompletelyTrimmed;
+        MetricsFile<TrimMetric, Integer> mf = new MetricsFile<>();
+        mf.addMetric(metric);
+        mf.addHistogram(this.numBasesTrimmed);
+        mf.write(this.OUTPUT_SUMMARY);
+    }
+
+    /** Read counts reported in the OUTPUT_SUMMARY metrics file. */
+    public static class TrimMetric extends MetricBase {
+        /** Reads in which a run was found, whether clipped or quality-masked. */
+        public long TRIMMED_READS = 0;
+        /** All records written to OUTPUT, including those not selected for trimming. */
+        public long TOTAL_READS = 0;
+        /** Reads whose run covered the whole read, so qualities were masked instead of clipping. */
+        public long COMPLETED_TRIMMED_READS = 0;
+    }
+
+    /** Command-line entry point. */
+    public static void main(String[] args) {
+        new TrimHomopolymerStartingSequence().instanceMainWithExit(args);
+    }
+}
diff --git a/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java b/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java
index 03037747..8f60ee79 100644
--- a/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java
+++ b/src/java/org/broadinstitute/dropseqrna/readtrimming/TrimStartingSequence.java
@@ -135,12 +135,6 @@ protected String[] customCommandLineValidation() {
         if (MISMATCH_RATE != null && (MISMATCH_RATE < 0 || MISMATCH_RATE >= 1)) {
             list.add("MISMATCH_RATE must be >= 0 and < 1");
         }
-        if (!VALID_WHICH_READ.containsAll(WHICH_READ)) {
-            list.add("WHICH_READ must be one of " + VALID_WHICH_READ);
-        }
-        if (WHICH_READ.isEmpty()) {
-            list.add("WHICH_READ must be specified");
-        }
         return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list);
     }
 
diff --git a/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/ClipReadsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/ClipReadsTest.java
index 02e78356..5cf60956 100644
--- a/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/ClipReadsTest.java
+++ b/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/ClipReadsTest.java
@@ -46,7 +46,6 @@ public
void testClipReads() { final ClipReads clp = new ClipReads(); clp.INPUT = PAIRED_INPUT; clp.OUTPUT = TestUtils.getTempReportFile("ClipReadsTest.", ".sam"); - clp.OUTPUT.deleteOnExit(); clp.TMP_DIR = Collections.singletonList(tempDir); clp.BASE_RANGE = "1-16:17-28"; clp.WHICH_READ = Collections.singletonList(AbstractTrimmerClp.FIRST_OF_PAIR); diff --git a/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequenceTest.java b/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequenceTest.java new file mode 100644 index 00000000..a70011c2 --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/readtrimming/TrimHomopolymerStartingSequenceTest.java @@ -0,0 +1,91 @@ +/* + * MIT License + * + * Copyright 2025 Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package org.broadinstitute.dropseqrna.readtrimming; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.util.CloserUtil; +import org.broadinstitute.dropseqrna.utils.TestUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; + +public class TrimHomopolymerStartingSequenceTest { + private static final File TESTDATA_DIR = new File("testdata/org/broadinstitute/dropseq/readtrimming"); + private static final File INPUT = new File(TESTDATA_DIR, "prePolyTTrim.paired.sam"); + + @Test + public void testBasic() { + final File tempDir = TestUtils.createTempDirectory("TrimHomopolymerStartingSequenceTest."); + final TrimHomopolymerStartingSequence clp = new TrimHomopolymerStartingSequence(); + clp.INPUT = INPUT; + clp.OUTPUT = TestUtils.getTempReportFile("TrimHomopolymerStartingSequenceTest.", ".sam"); + clp.WHICH_READ = Collections.singletonList(AbstractTrimmerClp.FIRST_OF_PAIR); + clp.TMP_DIR = Collections.singletonList(tempDir); + Assert.assertEquals(clp.doWork(), 0); + + final SamReader inputReader = SamReaderFactory.makeDefault().open(clp.INPUT); + final SamReader actualReader = SamReaderFactory.makeDefault().open(clp.OUTPUT); + final SAMRecordIterator inputIterator = inputReader.iterator(); + final SAMRecordIterator actualIterator = actualReader.iterator(); + while (inputIterator.hasNext() && actualIterator.hasNext()) { + final SAMRecord inputRecord = inputIterator.next(); + final SAMRecord actualRecord = actualIterator.next(); + final String readName = actualRecord.getReadName(); + Assert.assertEquals(readName, inputRecord.getReadName()); + Assert.assertEquals(actualRecord.getFirstOfPairFlag(), inputRecord.getFirstOfPairFlag(), readName); + if (actualRecord.getFirstOfPairFlag()) { + final String[] readNameFields = readName.split(":"); + if 
(readNameFields[0].equals("trimmed")) { + final int trimLength = Integer.parseInt(readNameFields[1]); + final String inputRead = inputRecord.getReadString().substring(trimLength); + Assert.assertEquals(actualRecord.getReadString(), inputRead, readName); + final String inputQual = inputRecord.getBaseQualityString().substring(trimLength); + Assert.assertEquals(actualRecord.getBaseQualityString(), inputQual, readName); + } else if (readNameFields[0].equals("notrim")) { + Assert.assertEquals(actualRecord, inputRecord); + } else if (readNameFields[0].equals("fulltrim")) { + Assert.assertEquals(actualRecord.getReadString(), inputRecord.getReadString(), readName); + byte[] expectedQuals = new byte[inputRecord.getReadLength()]; + Arrays.fill(expectedQuals, TrimHomopolymerStartingSequence.FULL_TRIM_QUALITY_SCORE); + Assert.assertEquals(actualRecord.getBaseQualities(), expectedQuals, readName); + } else { + Assert.fail("Unexpected read name: " + readName); + } + } else { + Assert.assertEquals(actualRecord, inputRecord); + } + } + Assert.assertFalse(inputIterator.hasNext()); + Assert.assertFalse(actualIterator.hasNext()); + CloserUtil.close(inputReader); + CloserUtil.close(actualReader); + + } +} diff --git a/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.sam b/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.sam index 3fb8c6b6..dbc892ed 100644 --- a/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.sam +++ b/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.sam @@ -1,8 +1,8 @@ @HD VN:1.6 GO:none SO:queryname -@RG ID:A SM:v29_10X-GEMX-3P_ic_rxn1 LB:v29_10X-GEMX-3P_ic_rxn1 PL:ILLUMINA PU:22FKMMLT4.ACCAGACAAC.CCTAGTTCCT.1 CN:BI -LH00453:184:22FKMMLT4:1:1101:10031:9853 77 * 0 0 * * 0 0 ACCGGCTGTGAGACTACGCTGCGCAGCCTTGTTTTTTTTTTTTTTTTTTTTTCTGACCGTCTTCGAACCTCCCCCCTTCGTTCTTGCTTAATGCCAACATTCTTCGCAAATGCTTTCTATCTTATCCTTCTTTTACCCACCCAAAAACCT 
-I9II9IIIIII9IIIIIII9IIIIII-9-------9I9999999I9I99-----99--9-99--99---------99--9--99--99-9999-99-9---99--99999-999---------99I--99---999---9---9----9 XC:Z:ACCGGCTGTGAGACTA RG:Z:A -LH00453:184:22FKMMLT4:1:1101:10031:9853 141 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG I--I----99II9I9I-I-9--I9I99999-I--99I----9-99------------9-99-9---9--99--9999-99-9-99----999-9--999--9-9999-9--999999-I99-I9-9I99I99---9-9999999-9999- XC:Z:ACCGGCTGTGAGACTA RG:Z:A XM:Z:CGCTGCGCAGCC -LH00453:184:22FKMMLT4:1:1101:31945:11394 77 * 0 0 * * 0 0 ATGCTAGTCACCCTGCTCTGCACTAGACTTTTTTTTTTTTTTTTTTTTTTGTAGTCACCAAAACTGTGAGGGGAGAAACCCCAAGCTCTCCCGCAAACCTCTCTCTCTCGTCACAACTATGTCTATTCCCCTTAACTTTTTCAAACAACC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII---9--9I-I99I9----99I99--99I9---99-99-99--9-9-9--9-----9I--------9-9-9---9-----9------9------9------ XC:Z:ATGCTAGTCACCCTGC RG:Z:A -LH00453:184:22FKMMLT4:1:1101:31945:11394 141 * 0 0 * * 0 0 AGAATGAAGGCAATCAATATTTTTCACTTGCTTCTCCACTAAGTTTGTTATTGCACCCATTATTCAAACTTTCAAATTCCTGGTTGAAAAAATGCTCATCTTCAAAATTTACTATTAACAGCCACAATTGTCACCATATATATATATTTG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:ATGCTAGTCACCCTGC RG:Z:A XM:Z:TCTGCACTAGAC -LH00453:184:22FKMMLT4:1:1101:6236:13425 77 * 0 0 * * 0 0 CTGTCATTCAGGAACGTGTGTGCCCAGCTTTTTTTTTTTTTTTTTTTTTTTTTTTTCGACCAAGAAACCCCCCACCCAACAAAAAAAAACCAACACCCACAAACACTTTTCCGCTTTAAAACACCCACAACAAAAAATTTTTTTCCCAAG IIIII9IIIIII-IIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIII-I--I9-99I-9I-I-9---I-I9--I9I-9I9---99-9-9---999II--9I9-9---9-9---------99--9--I9--9-9---9----- XC:Z:CTGTCATTCAGGAACG RG:Z:A -LH00453:184:22FKMMLT4:1:1101:6236:13425 141 * 0 0 * * 0 0 CAATGGTGCCTGTCCAGATTTTCTGACAGTCACCTCAGGAAATTGACACAGACACGCTGTGATCCTGGTTTCTGTACAGTTACTTTGTGTGTTACTCAGCACCTGGGCTGTTAAGGGTATTATCAAGCGTGAAAAAATAAAGACAAAAAC 
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A XM:Z:TGTGTGCCCAGC +@RG ID:A SM:rxn1 LB:rxn1 PL:ILLUMINA PU:123456789.ACCAGACAAC.CCTAGTTCCT.1 CN:BI +LH00453:184:123456789:1:1101:10031:9853 77 * 0 0 * * 0 0 ACCGGCTGTGAGACTACGCTGCGCAGCCTTGTTTTTTTTTTTTTTTTTTTTTCTGACCGTCTTCGAACCTCCCCCCTTCGTTCTTGCTTAATGCCAACATTCTTCGCAAATGCTTTCTATCTTATCCTTCTTTTACCCACCCAAAAACCT -I9II9IIIIII9IIIIIII9IIIIII-9-------9I9999999I9I99-----99--9-99--99---------99--9--99--99-9999-99-9---99--99999-999---------99I--99---999---9---9----9 XC:Z:ACCGGCTGTGAGACTA RG:Z:A +LH00453:184:123456789:1:1101:10031:9853 141 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG I--I----99II9I9I-I-9--I9I99999-I--99I----9-99------------9-99-9---9--99--9999-99-9-99----999-9--999--9-9999-9--999999-I99-I9-9I99I99---9-9999999-9999- XC:Z:ACCGGCTGTGAGACTA RG:Z:A XM:Z:CGCTGCGCAGCC +LH00453:184:123456789:1:1101:31945:11394 77 * 0 0 * * 0 0 ATGCTAGTCACCCTGCTCTGCACTAGACTTTTTTTTTTTTTTTTTTTTTTGTAGTCACCAAAACTGTGAGGGGAGAAACCCCAAGCTCTCCCGCAAACCTCTCTCTCTCGTCACAACTATGTCTATTCCCCTTAACTTTTTCAAACAACC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII---9--9I-I99I9----99I99--99I9---99-99-99--9-9-9--9-----9I--------9-9-9---9-----9------9------9------ XC:Z:ATGCTAGTCACCCTGC RG:Z:A +LH00453:184:123456789:1:1101:31945:11394 141 * 0 0 * * 0 0 AGAATGAAGGCAATCAATATTTTTCACTTGCTTCTCCACTAAGTTTGTTATTGCACCCATTATTCAAACTTTCAAATTCCTGGTTGAAAAAATGCTCATCTTCAAAATTTACTATTAACAGCCACAATTGTCACCATATATATATATTTG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:ATGCTAGTCACCCTGC RG:Z:A XM:Z:TCTGCACTAGAC +LH00453:184:123456789:1:1101:6236:13425 77 * 0 0 * * 0 0 
CTGTCATTCAGGAACGTGTGTGCCCAGCTTTTTTTTTTTTTTTTTTTTTTTTTTTTCGACCAAGAAACCCCCCACCCAACAAAAAAAAACCAACACCCACAAACACTTTTCCGCTTTAAAACACCCACAACAAAAAATTTTTTTCCCAAG IIIII9IIIIII-IIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIII-I--I9-99I-9I-I-9---I-I9--I9I-9I9---99-9-9---999II--9I9-9---9-9---------99--9--I9--9-9---9----- XC:Z:CTGTCATTCAGGAACG RG:Z:A +LH00453:184:123456789:1:1101:6236:13425 141 * 0 0 * * 0 0 CAATGGTGCCTGTCCAGATTTTCTGACAGTCACCTCAGGAAATTGACACAGACACGCTGTGATCCTGGTTTCTGTACAGTTACTTTGTGTGTTACTCAGCACCTGGGCTGTTAAGGGTATTATCAAGCGTGAAAAAATAAAGACAAAAAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A XM:Z:TGTGTGCCCAGC diff --git a/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.short_read.sam b/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.short_read.sam index 36d11787..8000fe71 100644 --- a/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.short_read.sam +++ b/testdata/org/broadinstitute/dropseq/readtrimming/paired_end.28_technical.short_read.sam @@ -1,8 +1,8 @@ @HD VN:1.6 GO:none SO:queryname -@RG ID:A SM:v29_10X-GEMX-3P_ic_rxn1 LB:v29_10X-GEMX-3P_ic_rxn1 PL:ILLUMINA PU:22FKMMLT4.ACCAGACAAC.CCTAGTTCCT.1 CN:BI -LH00453:184:22FKMMLT4:1:1101:10031:9853 77 * 0 0 * * 0 0 ACCGGCTGTGAGACTACGCTGCGCAGCCT -I9II9IIIIII9IIIIIII9IIIIII-9 XC:Z:ACCGGCTGTGAGACTA RG:Z:A -LH00453:184:22FKMMLT4:1:1101:10031:9853 141 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG I--I----99II9I9I-I-9--I9I99999-I--99I----9-99------------9-99-9---9--99--9999-99-9-99----999-9--999--9-9999-9--999999-I99-I9-9I99I99---9-9999999-9999- XC:Z:ACCGGCTGTGAGACTA RG:Z:A XM:Z:CGCTGCGCAGCC -LH00453:184:22FKMMLT4:1:1101:31945:11394 77 * 0 0 * * 0 0 
ATGCTAGTCACCCTGCTCTGCACTAGACTTTTTTTTTTTTTTTTTTTTTTGTAGTCACCAAAACTGTGAGGGGAGAAACCCCAAGCTCTCCCGCAAACCTCTCTCTCTCGTCACAACTATGTCTATTCCCCTTAACTTTTTCAAACAACC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII---9--9I-I99I9----99I99--99I9---99-99-99--9-9-9--9-----9I--------9-9-9---9-----9------9------9------ XC:Z:ATGCTAGTCACCCTGC RG:Z:A -LH00453:184:22FKMMLT4:1:1101:31945:11394 141 * 0 0 * * 0 0 AGAATGAAGGCAATCAATATTTTTCACTTGCTTCTCCACTAAGTTTGTTATTGCACCCATTATTCAAACTTTCAAATTCCTGGTTGAAAAAATGCTCATCTTCAAAATTTACTATTAACAGCCACAATTGTCACCATATATATATATTTG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:ATGCTAGTCACCCTGC RG:Z:A XM:Z:TCTGCACTAGAC -LH00453:184:22FKMMLT4:1:1101:6236:13425 77 * 0 0 * * 0 0 CTGTCATTCAGGAACGTGTG IIIII9IIIIII-IIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A -LH00453:184:22FKMMLT4:1:1101:6236:13425 141 * 0 0 * * 0 0 CAATGGTGCCTGTCCAGATTTTCTGACAGTCACCTCAGGAAATTGACACAGACACGCTGTGATCCTGGTTTCTGTACAGTTACTTTGTGTGTTACTCAGCACCTGGGCTGTTAAGGGTATTATCAAGCGTGAAAAAATAAAGACAAAAAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A XM:Z:TGTGTGCCCAGC +@RG ID:A SM:rxn1 LB:rxn1 PL:ILLUMINA PU:123456789.ACCAGACAAC.CCTAGTTCCT.1 CN:BI +LH00453:184:123456789:1:1101:10031:9853 77 * 0 0 * * 0 0 ACCGGCTGTGAGACTACGCTGCGCAGCCT -I9II9IIIIII9IIIIIII9IIIIII-9 XC:Z:ACCGGCTGTGAGACTA RG:Z:A +LH00453:184:123456789:1:1101:10031:9853 141 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG I--I----99II9I9I-I-9--I9I99999-I--99I----9-99------------9-99-9---9--99--9999-99-9-99----999-9--999--9-9999-9--999999-I99-I9-9I99I99---9-9999999-9999- XC:Z:ACCGGCTGTGAGACTA RG:Z:A XM:Z:CGCTGCGCAGCC +LH00453:184:123456789:1:1101:31945:11394 77 * 0 0 * * 0 0 
ATGCTAGTCACCCTGCTCTGCACTAGACTTTTTTTTTTTTTTTTTTTTTTGTAGTCACCAAAACTGTGAGGGGAGAAACCCCAAGCTCTCCCGCAAACCTCTCTCTCTCGTCACAACTATGTCTATTCCCCTTAACTTTTTCAAACAACC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII---9--9I-I99I9----99I99--99I9---99-99-99--9-9-9--9-----9I--------9-9-9---9-----9------9------9------ XC:Z:ATGCTAGTCACCCTGC RG:Z:A +LH00453:184:123456789:1:1101:31945:11394 141 * 0 0 * * 0 0 AGAATGAAGGCAATCAATATTTTTCACTTGCTTCTCCACTAAGTTTGTTATTGCACCCATTATTCAAACTTTCAAATTCCTGGTTGAAAAAATGCTCATCTTCAAAATTTACTATTAACAGCCACAATTGTCACCATATATATATATTTG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIII9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:ATGCTAGTCACCCTGC RG:Z:A XM:Z:TCTGCACTAGAC +LH00453:184:123456789:1:1101:6236:13425 77 * 0 0 * * 0 0 CTGTCATTCAGGAACGTGTG IIIII9IIIIII-IIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A +LH00453:184:123456789:1:1101:6236:13425 141 * 0 0 * * 0 0 CAATGGTGCCTGTCCAGATTTTCTGACAGTCACCTCAGGAAATTGACACAGACACGCTGTGATCCTGGTTTCTGTACAGTTACTTTGTGTGTTACTCAGCACCTGGGCTGTTAAGGGTATTATCAAGCGTGAAAAAATAAAGACAAAAAC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:CTGTCATTCAGGAACG RG:Z:A XM:Z:TGTGTGCCCAGC diff --git a/testdata/org/broadinstitute/dropseq/readtrimming/prePolyTTrim.paired.sam b/testdata/org/broadinstitute/dropseq/readtrimming/prePolyTTrim.paired.sam new file mode 100644 index 00000000..bda74776 --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/readtrimming/prePolyTTrim.paired.sam @@ -0,0 +1,9 @@ +@HD VN:1.6 GO:none SO:queryname +@RG ID:A SM:rxn1 LB:rxn1 PL:ILLUMINA PU:123456789.ACCAGACAAC.CCTAGTTCCT.1 CN:BI +@CO tests ordered by read name +fulltrim 77 * 0 0 * * 0 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT 
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII999999-999999999--99999-99I999IIII9 XC:Z:CGAATAGGTCTTGAGT RG:Z:A +fulltrim 141 * 0 0 * * 0 0 TCTGGAACATGGATTTGTGTTCACCTTAAATGTGAAAATAAATCCT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII XC:Z:CGAATAGGTCTTGAGT RG:Z:A ZL:i:104 XM:Z:ACGATTTACAGC ZP:i:47 +notrim:tooshort 77 * 0 0 * * 0 0 TCCACATCTTCCGGAAAAAATCCAGGTCTTCCAGCCAATATATGTCTTCCTGAAGATCCACGTCTTCCAGAAAATCCATGTCTTCCAGAAAATCCATGTCTTCCAGTAACCTCCCAGTCTTC IIIIIIIIIIIIIIIIIIIIIIIIIII-I9I9II99I--II9999-99-9999999I9-99I9-99-99I99-I---999I--9--99-999-9I-999I-9999I-9II99-9I9I999II XC:Z:GCTAGTCAGTGCCTGG RG:Z:A +notrim:tooshort 141 * 0 0 * * 0 0 AAGACACAAGTAGGCTGGAAGACATTAATTTGATGGAAGACATGGCTTTGTTGGAAGACGTGGATTTGCTGGAAGACACGGATTTCCTGGAAGACCTGGATTTTTCGGAAGCTATGGATTTGAGGGAAGACAAGGATTTTCTGGAAGACA IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9I9IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IIIIIIIIII XC:Z:GCTAGTCAGTGCCTGG RG:Z:A XM:Z:TCTTCCAGTCAA +trimmed:24 77 * 0 0 * * 0 0 TTGTTTTTTTTTTTTTTTTTTTTTCTGACCGTCTTCGAACCTCCCCCCTTCGTTCTTGCTTAATGCCAACATTCTTCGCAAATGCTTTCTATCTTATCCTTCTTTTACCCACCCAAAAACCT 9-------9I9999999I9I99-----99--9-99--99---------99--9--99--99-9999-99-9---99--99999-999---------99I--99---999---9---9----9 XC:Z:ACCGGCTGTGAGACTA RG:Z:A +trimmed:24 141 * 0 0 * * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG I--I----99II9I9I-I-9--I9I99999-I--99I----9-99------------9-99-9---9--99--9999-99-9-99----999-9--999--9-9999-9--999999-I99-I9-9I99I99---9-9999999-9999- XC:Z:ACCGGCTGTGAGACTA RG:Z:A XM:Z:CGCTGCGCAGCC