From 3df3e34bacf591eab34e012419c06798c4ebd9f6 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Thu, 27 Jun 2024 17:03:55 -0400 Subject: [PATCH] Add ability to collapse UMIs in UMICollection by edit distance in place --- .../digitalexpression/UMICollection.java | 43 +++++++++++++++---- .../MapBarcodesByEditDistance.java | 2 +- .../barnyard/DigitalExpressionTestUtil.java | 4 +- .../barnyard/DigitalExpressionTest.java | 27 ++++++------ .../digitalexpression/UMICollectionTest.java | 40 +++++++++++++++-- 5 files changed, 87 insertions(+), 29 deletions(-) diff --git a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java index de0dc8d7..e456a570 100644 --- a/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java +++ b/src/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollection.java @@ -24,13 +24,7 @@ package org.broadinstitute.dropseqrna.barnyard.digitalexpression; import java.io.File; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; +import java.util.*; import org.broadinstitute.dropseqrna.barnyard.GatherMolecularBarcodeDistributionByGene; import org.broadinstitute.dropseqrna.utils.ObjectCounter; @@ -158,14 +152,45 @@ public int getDigitalExpression (final int minBCReadThreshold, final int editDis * For a list of molecular barcodes, collapse them by edit distance. * @param counts * @param editDistance - * @param threshold - * @return + * @return A new object counter with the molecular barcodes collapsed by edit distance. */ private ObjectCounter collapseByEditDistance (final ObjectCounter counts, final int editDistance) { ObjectCounter result = mbed.collapseAndMergeBarcodes(counts, false, editDistance); return (result); } + /** + * Collapse the molecular barcodes in this collection by edit distance. This only makes sense if reads != null, + * because otherwise, one could just call getMolecularBarcodeCountsCollapsed to get collapsed read counts. + * The reads are re-tagged with the collapsed molecular barcode, and re-grouped, and the molecular barcode counts are updated. + * @param editDistance + * @param molecularBarcodeTag + */ + public void collapseThisByEditDistance(final int editDistance, final String molecularBarcodeTag) { + if (reads == null) { + throw new IllegalStateException("It doesn't make sense to collapse in place without reads"); + } + if (editDistance < 1) { + throw new IllegalArgumentException("Edit distance must be at least 1"); + } + Map> collapsedToUncollapsed = mbed.collapseBarcodes(this.molecularBarcodeCounts, false, editDistance); + for (Map.Entry> entry: collapsedToUncollapsed.entrySet()) { + final String goodUmi = entry.getKey(); + final List umisToBeCollapsed = entry.getValue(); + final List readsForGoodUmi = reads.get(goodUmi); + for (final String umiToBeCollapsed : umisToBeCollapsed) { + final List readsToBeCollapsed = reads.get(umiToBeCollapsed); + readsToBeCollapsed.stream().forEach(r -> r.setAttribute(molecularBarcodeTag, goodUmi)); + readsForGoodUmi.addAll(readsToBeCollapsed); + reads.remove(umiToBeCollapsed); + } + } + molecularBarcodeCounts = new ObjectCounter<>(); + for (Map.Entry> entry: this.reads.entrySet()) { + molecularBarcodeCounts.setCount(entry.getKey(), entry.getValue().size()); + } + } + public static Collection parseUMICollectionFile (final File input) { IOUtil.assertFileIsReadable(input); diff --git a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java index 9536e01c..ac4b7958 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/editdistance/MapBarcodesByEditDistance.java @@ -239,7 +239,7 @@ private boolean assertAllArraysSameLength (final Collection charArrays) * @param barcodes * @param findIndels * @param editDistance - * @return + * @return map from the core barcode to the list of barcodes that are collapsed into it. */ public Map> collapseBarcodes (final ObjectCounter barcodes, final boolean findIndels, final int editDistance) { List coreBarcodes = barcodes.getKeysOrderedByCount(true); diff --git a/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java b/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java index 39de60ca..fbfa932c 100644 --- a/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java +++ b/src/testFixtures/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTestUtil.java @@ -58,8 +58,8 @@ public class DigitalExpressionTestUtil { * */ - static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam"); - static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"}; + public static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam"); + public static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"}; /** * @return A digital expression file, which is marked as deleteOnExit diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java index 8bb239ff..f8a9255f 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/DigitalExpressionTest.java @@ -54,28 +54,29 @@ public class DigitalExpressionTest { // expected results for standard coding strand specific DGE private static final File EXPECTED_OUTFILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge.txt"); private static final File EXPECTED_OUTFILE_SUMMARY = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_summary.txt"); - private static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt"); + public static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt"); private static final File EXPECTED_OUTPUT_SINGLE_CELL=new File("testdata/org/broadinstitute/transcriptome/barnyard/1_cell.dge.txt"); // - private String GENE_NAME_TAG="gn"; - private String GENE_STRAND_TAG="gs"; - private String GENE_FUNCTION_TAG="gf"; - private StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE; - private List LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR)); - private String CELL_BARCODE_TAG="ZC"; - private String MOLECULAR_BARCODE_TAG = "XM"; - private int READ_MQ=10; - private Boolean USE_STRAND_INFO=true; + public static final String GENE_NAME_TAG="gn"; + public static final String GENE_STRAND_TAG="gs"; + public static final String GENE_FUNCTION_TAG="gf"; + public static final StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE; + public static final List LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR)); + public static final String CELL_BARCODE_TAG="ZC"; + public static final String MOLECULAR_BARCODE_TAG = "XM"; + public static final int READ_MQ=10; + private static final Boolean USE_STRAND_INFO=true; - private UMIIterator getUMIIterator (final File inFile) { + private static UMIIterator getUMIIterator (final File inFile) { List cellBarcodes = Arrays.asList(DigitalExpressionTestUtil.barcodes); UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG, - GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, - this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, true, cellBarcodes); + GENE_STRAND_TAG, GENE_FUNCTION_TAG, STRAND_STRATEGY, LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, + CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ, true, cellBarcodes, false, false, + false, null); return (umiIterator); } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java index 9d9c706f..5fb50bc9 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/barnyard/digitalexpression/UMICollectionTest.java @@ -23,15 +23,20 @@ */ package org.broadinstitute.dropseqrna.barnyard.digitalexpression; +import htsjdk.samtools.util.IterableAdapter; +import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTest; +import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTestUtil; +import org.broadinstitute.dropseqrna.barnyard.GeneFunctionCommandLineBase; import org.broadinstitute.dropseqrna.utils.ObjectCounter; +import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil; +import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator; import org.testng.Assert; import org.testng.annotations.Test; +import picard.sam.util.Pair; +import picard.util.TabbedTextFileWithHeaderParser; import java.io.File; -import java.util.Collection; -import java.util.Random; - - +import java.util.*; public class UMICollectionTest { @@ -130,4 +135,31 @@ public void testDownsample() { testDownsampleRate(0.001, 1000000.0, 0.0005); } + // Do UMI collapse and confirm that the UMI counts are the same as DigitalExpressionTest.doWork() + @Test + public void testEditDistanceCollapse() { + final UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false), + DigitalExpressionTest.GENE_NAME_TAG, + DigitalExpressionTest.GENE_STRAND_TAG, DigitalExpressionTest.GENE_FUNCTION_TAG, DigitalExpressionTest.STRAND_STRATEGY, + DigitalExpressionTest.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, + "XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ, + false, Arrays.asList(DigitalExpressionTestUtil.barcodes), false, false, + true, null); + + final Map, Integer> umiCounts = new HashMap<>(); + for (final UMICollection c : new IterableAdapter(umiIterator)) { + c.collapseThisByEditDistance(1, DigitalExpressionTest.MOLECULAR_BARCODE_TAG); + final Pair key = new Pair<>(c.getCellBarcode(), c.getGeneName()); + umiCounts.put(key, c.getMolecularBarcodeCounts().getSize()); + } + final Map, Integer> expectedUmiCounts = new HashMap<>(); + final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(DigitalExpressionTest.EXPECTED_OUTFILE_LONG); + for (final TabbedTextFileWithHeaderParser.Row row: parser) { + final String cellBarcode = row.getField("CELL"); + final String geneName = row.getField("GENE"); + final int umiCount = row.getIntegerField("UMI_COUNT"); + expectedUmiCounts.put(new Pair<>(cellBarcode, geneName), umiCount); + } + Assert.assertEquals(umiCounts, expectedUmiCounts); + } }