From 3df3e34bacf591eab34e012419c06798c4ebd9f6 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Thu, 27 Jun 2024 17:03:55 -0400 Subject: [PATCH 1/3] Add ability to collapse UMIs in UMICollection by edit distance in place --- .../digitalexpression/UMICollection.java | 43 +++++++++++++++---- .../MapBarcodesByEditDistance.java | 2 +- .../barnyard/DigitalExpressionTestUtil.java | 4 +- .../barnyard/DigitalExpressionTest.java | 27 ++++++------ .../digitalexpression/UMICollectionTest.java | 40 +++++++++++++++-- 5 files changed, 87 insertions(+), 29 deletions(-) diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java index de0dc8d7..e456a570 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java @@ -24,13 +24,7 @@ package org.broadinstitute.dropseqrna.barnyard.digitalexpression; import java.io.File; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; +import java.util.*; import org.broadinstitute.dropseqrna.barnyard.GatherMolecularBarcodeDistributionByGene; import org.broadinstitute.dropseqrna.utils.ObjectCounter; @@ -158,14 +152,45 @@ public int getDigitalExpression (final int minBCReadThreshold, final int editDis * For a list of molecular barcodes, collapse them by edit distance. * @param counts * @param editDistance - * @param threshold - * @return + * @return A new object counter with the molecular barcodes collapsed by edit distance. */ private ObjectCounter collapseByEditDistance (final ObjectCounter counts, final int editDistance) { ObjectCounter result = mbed.collapseAndMergeBarcodes(counts, false, editDistance); return (result); } + /** + * Collapse the molecular barcodes in this collection by edit distance. This only makes sense if reads != null, + * because otherwise, one could just call getMolecularBarcodeCountsCollapsed to get collapsed read counts. + * The reads are re-tagged with the collapsed molecular barcode, and re-grouped, and the molecular barcode counts are updated. + * @param editDistance + * @param molecularBarcodeTag + */ + public void collapseThisByEditDistance(final int editDistance, final String molecularBarcodeTag) { + if (reads == null) { + throw new IllegalStateException("It doesn't make sense to collapse in place without reads"); + } + if (editDistance < 1) { + throw new IllegalArgumentException("Edit distance must be at least 1"); + } + Map> collapsedToUncollapsed = mbed.collapseBarcodes(this.molecularBarcodeCounts, false, editDistance); + for (Map.Entry> entry: collapsedToUncollapsed.entrySet()) { + final String goodUmi = entry.getKey(); + final List umisToBeCollapsed = entry.getValue(); + final List readsForGoodUmi = reads.get(goodUmi); + for (final String umiToBeCollapsed : umisToBeCollapsed) { + final List readsToBeCollapsed = reads.get(umiToBeCollapsed); + readsToBeCollapsed.stream().forEach(r -> r.setAttribute(molecularBarcodeTag, goodUmi)); + readsForGoodUmi.addAll(readsToBeCollapsed); + reads.remove(umiToBeCollapsed); + } + } + molecularBarcodeCounts = new ObjectCounter<>(); + for (Map.Entry> entry: this.reads.entrySet()) { + molecularBarcodeCounts.setCount(entry.getKey(), entry.getValue().size()); + } + } + public static Collection parseUMICollectionFile (final File input) { IOUtil.assertFileIsReadable(input); diff --git a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java index 9536e01c..ac4b7958 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java @@ -239,7 +239,7 @@ private boolean assertAllArraysSameLength (final Collection charArrays) * @param barcodes * @param findIndels * @param editDistance - * @return + * @return map from the core barcode to the list of barcodes that are collapsed into it. */ public Map> collapseBarcodes (final ObjectCounter barcodes, final boolean findIndels, final int editDistance) { List coreBarcodes = barcodes.getKeysOrderedByCount(true); diff --git a/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java b/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java index 39de60ca..fbfa932c 100644 --- a/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java +++ b/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java @@ -58,8 +58,8 @@ public class DigitalExpressionTestUtil { * */ - static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam"); - static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"}; + public static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam"); + public static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"}; /** * @return A digital expression file, which is marked as deleteOnExit diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java index 8bb239ff..f8a9255f 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java @@ -54,28 +54,29 @@ public class DigitalExpressionTest { // expected results for standard coding strand specific DGE private static final File EXPECTED_OUTFILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge.txt"); private static final File EXPECTED_OUTFILE_SUMMARY = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_summary.txt"); - private static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt"); + public static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt"); private static final File EXPECTED_OUTPUT_SINGLE_CELL=new File("testdata/org/broadinstitute/transcriptome/barnyard/1_cell.dge.txt"); // - private String GENE_NAME_TAG="gn"; - private String GENE_STRAND_TAG="gs"; - private String GENE_FUNCTION_TAG="gf"; - private StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE; - private List LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR)); - private String CELL_BARCODE_TAG="ZC"; - private String MOLECULAR_BARCODE_TAG = "XM"; - private int READ_MQ=10; - private Boolean USE_STRAND_INFO=true; + public static final String GENE_NAME_TAG="gn"; + public static final String GENE_STRAND_TAG="gs"; + public static final String GENE_FUNCTION_TAG="gf"; + public static final StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE; + public static final List LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR)); + public static final String CELL_BARCODE_TAG="ZC"; + public static final String MOLECULAR_BARCODE_TAG = "XM"; + public static final int READ_MQ=10; + private static final Boolean USE_STRAND_INFO=true; - private UMIIterator getUMIIterator (final File inFile) { + private static UMIIterator getUMIIterator (final File inFile) { List cellBarcodes = Arrays.asList(DigitalExpressionTestUtil.barcodes); UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, - GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, true, cellBarcodes); + GENE_STRAND_TAG, GENE_FUNCTION_TAG, STRAND_STRATEGY, LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, + CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ, true, cellBarcodes, false, false, + false, null); return (umiIterator); } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java index 9d9c706f..5fb50bc9 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java @@ -23,15 +23,20 @@ */ package org.broadinstitute.dropseqrna.barnyard.digitalexpression; +import htsjdk.samtools.util.IterableAdapter; +import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTest; +import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTestUtil; +import org.broadinstitute.dropseqrna.barnyard.GeneFunctionCommandLineBase; import org.broadinstitute.dropseqrna.utils.ObjectCounter; +import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil; +import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator; import org.testng.Assert; import org.testng.annotations.Test; +import picard.sam.util.Pair; +import picard.util.TabbedTextFileWithHeaderParser; import java.io.File; -import java.util.Collection; -import java.util.Random; - - +import java.util.*; public class UMICollectionTest { @@ -130,4 +135,31 @@ public void testDownsample() { testDownsampleRate(0.001, 1000000.0, 0.0005); } + // Do UMI collapse and confirm that the UMI counts are the same as DigitalExpressionTest.doWork() + @Test + public void testEditDistanceCollapse() { + final UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false), + DigitalExpressionTest.GENE_NAME_TAG, + DigitalExpressionTest.GENE_STRAND_TAG, DigitalExpressionTest.GENE_FUNCTION_TAG, DigitalExpressionTest.STRAND_STRATEGY, + DigitalExpressionTest.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, + "XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ, + false, Arrays.asList(DigitalExpressionTestUtil.barcodes), false, false, + true, null); + + final Map, Integer> umiCounts = new HashMap<>(); + for (final UMICollection c : new IterableAdapter(umiIterator)) { + c.collapseThisByEditDistance(1, DigitalExpressionTest.MOLECULAR_BARCODE_TAG); + final Pair key = new Pair<>(c.getCellBarcode(), c.getGeneName()); + umiCounts.put(key, c.getMolecularBarcodeCounts().getSize()); + } + final Map, Integer> expectedUmiCounts = new HashMap<>(); + final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(DigitalExpressionTest.EXPECTED_OUTFILE_LONG); + for (final TabbedTextFileWithHeaderParser.Row row: parser) { + final String cellBarcode = row.getField("CELL"); + final String geneName = row.getField("GENE"); + final int umiCount = row.getIntegerField("UMI_COUNT"); + expectedUmiCounts.put(new Pair<>(cellBarcode, geneName), umiCount); + } + Assert.assertEquals(umiCounts, expectedUmiCounts); + } } From 139bc61beec3acea73c11b68916ed96640ec92f1 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Tue, 2 Jul 2024 09:09:46 -0400 Subject: [PATCH 2/3] UMIIteratorBuilder --- .../barnyard/DigitalExpression.java | 6 +- ...herMolecularBarcodeDistributionByGene.java | 8 +- .../barnyard/MarkChimericReads.java | 6 +- .../barnyard/SelectCellsByNumTranscripts.java | 4 +- .../FilterReadsByUMISupport.java | 4 +- .../BiasedBarcodeCollectionFactory.java | 4 +- .../DetectBeadSynthesisErrors.java | 4 +- .../DetectBeadSubstitutionErrors.java | 4 +- .../utils/readiterators/UMIIterator.java | 168 +++++++++++------- .../barnyard/DigitalExpressionTest.java | 14 +- .../digitalexpression/UMICollectionTest.java | 7 +- 11 files changed, 132 insertions(+), 97 deletions(-) diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpression.java b/src/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpression.java index 4a2a3fec..c9ba0243 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpression.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpression.java @@ -186,10 +186,10 @@ private void digitalExpression(List cellBarcodes) { writeDgeHeader(out); // TODO should the ambiguous reads handling be a parameter? It's set to false by default for DGE to get rid of ambiguous gene assignments on reads - UMIIterator realUMIIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false), + UMIIterator realUMIIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false), GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, - this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, false, cellBarcodes, - false, OMIT_MISSING_CELLS); + this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setCellBarcodes(cellBarcodes). + recordCellsInInput(OMIT_MISSING_CELLS).build(); CloseableIterator umiIterator = realUMIIterator; if (OMIT_MISSING_CELLS) { diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/GatherMolecularBarcodeDistributionByGene.java b/src/java/org/broadinstitute/dropseqrna/barnyard/GatherMolecularBarcodeDistributionByGene.java index 62f9f58b..701a87ac 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/GatherMolecularBarcodeDistributionByGene.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/GatherMolecularBarcodeDistributionByGene.java @@ -96,10 +96,10 @@ protected int doWork() { this.CELL_BC_FILE, this.READ_MQ, this.MIN_NUM_TRANSCRIPTS_PER_CELL, this.MIN_NUM_GENES_PER_CELL, this.MIN_NUM_READS_PER_CELL, this.NUM_CORE_BARCODES, this.EDIT_DISTANCE, this.MIN_BC_READ_THRESHOLD)); - UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false), + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false), GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, cellBarcodes, true, false); + this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build(); UMICollection batch; @@ -163,10 +163,10 @@ public ObjectCounter getNumTranscriptsPerCell (final List bamFile, SamReaderFactory factory= SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE); SamHeaderAndIterator headerIterator= SamFileMergeUtil.mergeInputs(bamFile, false, factory); - UMIIterator umiIterator = new UMIIterator(headerIterator, + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerIterator, geneNameTag, strandTag, geneFunctionTag, strategy, locusFunctionList, functionStrategy, cellBarcodeTag, molecularBarcodeTag, - mapQuality, false, cellBarcodes); + mapQuality).setCellBarcodes(cellBarcodes).build(); ObjectCounter transcriptsPerCell = new ObjectCounter<>(); diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/MarkChimericReads.java b/src/java/org/broadinstitute/dropseqrna/barnyard/MarkChimericReads.java index 796f519f..551a070f 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/MarkChimericReads.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/MarkChimericReads.java @@ -46,10 +46,8 @@ import org.broadinstitute.dropseqrna.utils.readiterators.GeneFunctionProcessor; import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil; import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator; -import org.broadinstitute.dropseqrna.utils.readiterators.StrandStrategy; import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator; import picard.cmdline.StandardOptionDefinitions; -import picard.illumina.BarcodeMetric; import java.io.BufferedWriter; import java.io.File; @@ -224,10 +222,10 @@ private Map identifyChimericsAndWriteReport(boole final Set cellBarcodes=getCellBarcodes(); PeekableIterator umiIterator = new PeekableIterator<>( - new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false), + new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false), GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, cellBarcodes, true, false)); + this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build()); // Remember {CBC, UMI, Gene} pairs to be marked chimeric final Map chimerics = new HashMap<>(); diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/SelectCellsByNumTranscripts.java b/src/java/org/broadinstitute/dropseqrna/barnyard/SelectCellsByNumTranscripts.java index dddba987..0a100582 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/SelectCellsByNumTranscripts.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/SelectCellsByNumTranscripts.java @@ -131,9 +131,9 @@ protected int doWork() { mapContainer = new SingleOrganismMapContainer(cellBarcodes); // gene/exon tags are sorted first, followed by cells - UMIIterator umiIterator = new UMIIterator(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, cellBarcodes); + this.READ_MQ).setCellBarcodes(cellBarcodes).build(); String gene = null; diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/FilterReadsByUMISupport.java b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/FilterReadsByUMISupport.java index eb5ae7b2..06577b4d 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/FilterReadsByUMISupport.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalallelecounts/FilterReadsByUMISupport.java @@ -86,9 +86,9 @@ protected int doWork() { FilteredUmiMetrics metrics = new FilteredUmiMetrics(); - UMIIterator umiIterator = new UMIIterator(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, cellBarcodes, false, false, true, null); + this.READ_MQ).setCellBarcodes(cellBarcodes).retainReads(true).build(); while (umiIterator.hasNext()) { UMICollection c = umiIterator.next(); diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BiasedBarcodeCollectionFactory.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BiasedBarcodeCollectionFactory.java index 44615495..14ffa6ed 100644 --- a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BiasedBarcodeCollectionFactory.java +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/BiasedBarcodeCollectionFactory.java @@ -53,11 +53,11 @@ public class BiasedBarcodeCollectionFactory { public UMIIterator prepareUMIIterator(final List inputFiles, final String geneExonTag, final String cellBarcodeTag, final String molBCTag, final String strandTag, final int readMQ, final List cellBarcodes) { - return new UMIIterator(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory), + return new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory), GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY, GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - cellBarcodeTag, molBCTag, readMQ, false, cellBarcodes, true, false); + cellBarcodeTag, molBCTag, readMQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build(); } diff --git a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrors.java b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrors.java index ae9dafdc..24e6e9e2 100644 --- a/src/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrors.java +++ b/src/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrors.java @@ -588,10 +588,10 @@ public String fixUMI (final String cellBarcode, final String umi, final int erro public UMIIterator prepareUMIIterator() { List barcodes=getCellBarcodes(); - UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory), + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory), GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, barcodes, true, false); + this.READ_MQ).setCellBarcodes(barcodes).cellFirstSort(true).build(); return (umiIterator); } diff --git a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/DetectBeadSubstitutionErrors.java b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/DetectBeadSubstitutionErrors.java index f5217799..9e21e822 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/DetectBeadSubstitutionErrors.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/DetectBeadSubstitutionErrors.java @@ -140,12 +140,12 @@ protected int doWork() { // build up the UMI per cell data set. Collection cellBarcodes = null; - UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false), + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false), GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY, GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, - this.READ_MQ, false, cellBarcodes, true, false); + this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build(); // get list of barcodes that have enough UMIs, and are not polyT biased. UMIsPerCellResult umiResult=getUMIsPerCell(umiIterator, this.MIN_UMIS_PER_CELL, this.UMI_BIAS_BASE, this.UMI_BIAS_THRESHOLD, null); diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java index 619282a8..d706f392 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/UMIIterator.java @@ -49,72 +49,7 @@ public class UMIIterator implements CloseableIterator { private final StringInterner stringCache = new StringInterner(); private final Set cellBarcodesSeen; private final boolean retainReads; - /** - * Construct an object that generates UMI objects from a BAM file - * @param headerAndIterator The BAM records to extract UMIs from - * @param geneTag The gene tag on BAM records - * @param cellBarcodeTag The cell barcode tag on BAM records - * @param molecularBarcodeTag The molecular barcode tag on BAM records - * @param geneStrandTag The strand tag on BAM records - * @param readMQ The minimum map quality of the reads - * @param assignReadsToAllGenes Should records tagged with multiple genes be double counted, once for each gene? - * @param strandStrategy should the gene and read strand match for the read to be accepted - * @param cellBarcodes The list of cell barcode tag values that match the tag on the BAM records. - * Only reads with these values will be used. If set to null, all cell barcodes are used. - */ - public UMIIterator(final SamHeaderAndIterator headerAndIterator, - final String geneTag, - final String geneStrandTag, - final String geneFunctionTag, - final StrandStrategy strandStrategy, - final Collection acceptedLociFunctions, - FunctionalDataProcessorStrategy functionStrategy, - final String cellBarcodeTag, - final String molecularBarcodeTag, - final int readMQ, - final boolean assignReadsToAllGenes, - final Collection cellBarcodes) { - this(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy, acceptedLociFunctions, functionStrategy, - cellBarcodeTag, molecularBarcodeTag, readMQ, assignReadsToAllGenes, cellBarcodes, false, false); - } - /** - * Construct an object that generates UMI objects from a BAM file - * @param headerAndIterator The BAM records to extract UMIs from - * @param geneTag The geneExon tag on BAM records - * @param cellBarcodeTag The cell barcode tag on BAM records - * @param molecularBarcodeTag The molecular barcode tag on BAM records - * @param geneStrandTag The strand tag on BAM records - * @param readMQ The minimum map quality of the reads - * @param assignReadsToAllGenes Should records tagged with multiple genes be double counted, once for each gene? - * @param strandStrategy should the gene and read strand match for the read to be accepted - * @param cellBarcodes The list of cell barcode tag values that match the tag on the BAM records. - * Only reads with these values will be used. If set to null, all cell barcodes are used. - * @param cellFirstSort if true, then cell barcodes are sorted first, followed by gene/exon tags. - * If false, then gene/exon tags are sorted first, followed by cells. false is the default and used in the other constructor. - * @param recordCellsInInput While sorting the input, keep track of what cells appear in the input. This record - * is not complete until iteration is started. - */ - public UMIIterator(final SamHeaderAndIterator headerAndIterator, - final String geneTag, - final String geneStrandTag, - final String geneFunctionTag, - final StrandStrategy strandStrategy, - final Collection acceptedLociFunctions, - FunctionalDataProcessorStrategy functionStrategy, - final String cellBarcodeTag, - final String molecularBarcodeTag, - final int readMQ, - final boolean assignReadsToAllGenes, - final Collection cellBarcodes, - final boolean cellFirstSort, - final boolean recordCellsInInput) { - - this(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy, acceptedLociFunctions, functionStrategy, - cellBarcodeTag, molecularBarcodeTag, readMQ, assignReadsToAllGenes, cellBarcodes, cellFirstSort, recordCellsInInput, false, - null); - - } /** * Construct an object that generates UMI objects from a BAM file * @param headerAndIterator The BAM records to extract UMIs from @@ -135,7 +70,7 @@ public UMIIterator(final SamHeaderAndIterator headerAndIterator, * This is false in other method signatures by default. If false, reads can be simplified for faster serialization. * */ - public UMIIterator(final SamHeaderAndIterator headerAndIterator, + private UMIIterator(final SamHeaderAndIterator headerAndIterator, final String geneTag, final String geneStrandTag, final String geneFunctionTag, @@ -376,4 +311,105 @@ public int compare(SAMRecord o1, SAMRecord o2) { return comp.fileOrderCompare(o1, o2); } } + + public static class UMIIteratorBuilder { + // required parameters + private final SamHeaderAndIterator headerAndIterator; + private final String geneTag; + private final String geneStrandTag; + private final String geneFunctionTag; + private final StrandStrategy strandStrategy; + private final Collection acceptedLociFunctions; + private final FunctionalDataProcessorStrategy functionStrategy; + private final String cellBarcodeTag; + private final String molecularBarcodeTag; + private final int readMQ; + + // parameters with default values + private boolean assignReadsToAllGenes=false; + private boolean cellFirstSort=false; + private boolean recordCellsInInput=false; + private boolean retainReads=false; + + // nullable parameters + private Collection cellBarcodes; + private IntervalList intervals; + + public UMIIteratorBuilder(SamHeaderAndIterator headerAndIterator, String geneTag, String geneStrandTag, + String geneFunctionTag, StrandStrategy strandStrategy, + Collection acceptedLociFunctions, + FunctionalDataProcessorStrategy functionStrategy, String cellBarcodeTag, + String molecularBarcodeTag, int readMQ) { + this.headerAndIterator = headerAndIterator; + this.geneTag = geneTag; + this.geneStrandTag = geneStrandTag; + this.geneFunctionTag = geneFunctionTag; + this.strandStrategy = strandStrategy; + this.acceptedLociFunctions = acceptedLociFunctions; + this.functionStrategy = functionStrategy; + this.cellBarcodeTag = cellBarcodeTag; + this.molecularBarcodeTag = molecularBarcodeTag; + this.readMQ = readMQ; + } + + public boolean isAssignReadsToAllGenes() { + return assignReadsToAllGenes; + } + + public UMIIteratorBuilder assignReadsToAllGenes(boolean assignReadsToAllGenes) { + this.assignReadsToAllGenes = assignReadsToAllGenes; + return this; + } + + public boolean isCellFirstSort() { + return cellFirstSort; + } + + public UMIIteratorBuilder cellFirstSort(boolean cellFirstSort) { + this.cellFirstSort = cellFirstSort; + return this; + } + + public boolean isRecordCellsInInput() { + return recordCellsInInput; + } + + public UMIIteratorBuilder recordCellsInInput(boolean recordCellsInInput) { + this.recordCellsInInput = recordCellsInInput; + return this; + } + + public boolean isRetainReads() { + return retainReads; + } + + public UMIIteratorBuilder retainReads(boolean retainReads) { + this.retainReads = retainReads; + return this; + } + + public Collection getCellBarcodes() { + return cellBarcodes; + } + + public UMIIteratorBuilder setCellBarcodes(Collection cellBarcodes) { + this.cellBarcodes = cellBarcodes; + return this; + } + + public IntervalList getIntervals() { + return intervals; + } + + public UMIIteratorBuilder setIntervals(IntervalList intervals) { + this.intervals = intervals; + return this; + } + + public UMIIterator build() { + return new UMIIterator(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy, + acceptedLociFunctions, functionStrategy, cellBarcodeTag, molecularBarcodeTag, readMQ, + assignReadsToAllGenes, cellBarcodes, cellFirstSort, recordCellsInInput, retainReads, intervals); + } + } } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java index f8a9255f..4bc0d361 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java @@ -73,10 +73,9 @@ public class DigitalExpressionTest { private static UMIIterator getUMIIterator (final File inFile) { List cellBarcodes = Arrays.asList(DigitalExpressionTestUtil.barcodes); - UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, STRAND_STRATEGY, LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ, true, cellBarcodes, false, false, - false, null); + CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ).assignReadsToAllGenes(true).setCellBarcodes(cellBarcodes).build(); return (umiIterator); } @@ -202,13 +201,16 @@ public void DGEIntegrationTest () { Assert.assertNotEquals(count,0); } + @Test public void testTwoGenesOnSameStrand () { List barcodes = Collections.singletonList("FOO"); File inFile = new File (""); - UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, - GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, true, barcodes); + UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder( + SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, + GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, + GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, + this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setAssignReadsToAllGenes(true).setCellBarcodes(barcodes).build(); } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java index 5fb50bc9..c5c96ef4 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java @@ -138,13 +138,12 @@ public void testDownsample() { // Do UMI collapse and confirm that the UMI counts are the same as DigitalExpressionTest.doWork() @Test public void testEditDistanceCollapse() { - final UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false), + final UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false), DigitalExpressionTest.GENE_NAME_TAG, DigitalExpressionTest.GENE_STRAND_TAG, DigitalExpressionTest.GENE_FUNCTION_TAG, DigitalExpressionTest.STRAND_STRATEGY, DigitalExpressionTest.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - "XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ, - false, Arrays.asList(DigitalExpressionTestUtil.barcodes), false, false, - true, null); + "XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ). + setCellBarcodes(Arrays.asList(DigitalExpressionTestUtil.barcodes)).retainReads(true).build(); final Map, Integer> umiCounts = new HashMap<>(); for (final UMICollection c : new IterableAdapter(umiIterator)) { From d8eeac29af0f1c97edcadfe03438dec4caeda0a3 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Tue, 2 Jul 2024 10:01:56 -0400 Subject: [PATCH 3/3] Remove test that appears never to have been implemented --- .../barnyard/DigitalExpressionTest.java | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java index 4bc0d361..33a72443 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java @@ -201,22 +201,6 @@ public void DGEIntegrationTest () { Assert.assertNotEquals(count,0); } - @Test - public void testTwoGenesOnSameStrand () { - List barcodes = Collections.singletonList("FOO"); - File inFile = new File (""); - - UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder( - SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, - GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, - GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, - this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setAssignReadsToAllGenes(true).setCellBarcodes(barcodes).build(); - } - - - - - //GET COUNTS OF READS // ~/samtools view -q 10 5cell3gene.bam |grep ZC:Z:ATCAGGGACAGA |grep HUMAN_3:42642106-42690227:NKTR | sed -E 's/.*XM private int getReadCounts (final String gene, final String cell) {