Skip to content

Commit

Permalink
Add ability to collapse UMIs in UMICollection by edit distance in place
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed Jul 1, 2024
1 parent 9466915 commit 3df3e34
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,7 @@
package org.broadinstitute.dropseqrna.barnyard.digitalexpression;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.*;

import org.broadinstitute.dropseqrna.barnyard.GatherMolecularBarcodeDistributionByGene;
import org.broadinstitute.dropseqrna.utils.ObjectCounter;
Expand Down Expand Up @@ -158,14 +152,45 @@ public int getDigitalExpression (final int minBCReadThreshold, final int editDis
* For a list of molecular barcodes, collapse them by edit distance.
* @param counts
* @param editDistance
* @param threshold
* @return
* @return A new object counter with the molecular barcodes collapsed by edit distance.
*/
private ObjectCounter<String> collapseByEditDistance (final ObjectCounter<String> counts, final int editDistance) {
ObjectCounter<String> result = mbed.collapseAndMergeBarcodes(counts, false, editDistance);
return (result);
}

/**
* Collapse the molecular barcodes in this collection by edit distance. This only makes sense if reads != null,
* because otherwise, one could just call getMolecularBarcodeCountsCollapsed to get collapsed read counts.
* The reads are re-tagged with the collapsed molecular barcode, and re-grouped, and the molecular barcode counts are updated.
* @param editDistance
* @param molecularBarcodeTag
*/
public void collapseThisByEditDistance(final int editDistance, final String molecularBarcodeTag) {
if (reads == null) {
throw new IllegalStateException("It doesn't make sense to collapse in place without reads");
}
if (editDistance < 1) {
throw new IllegalArgumentException("Edit distance must be at least 1");
}
Map<String, List<String>> collapsedToUncollapsed = mbed.collapseBarcodes(this.molecularBarcodeCounts, false, editDistance);
for (Map.Entry<String, List<String>> entry: collapsedToUncollapsed.entrySet()) {
final String goodUmi = entry.getKey();
final List<String> umisToBeCollapsed = entry.getValue();
final List<SAMRecord> readsForGoodUmi = reads.get(goodUmi);
for (final String umiToBeCollapsed : umisToBeCollapsed) {
final List<SAMRecord> readsToBeCollapsed = reads.get(umiToBeCollapsed);
readsToBeCollapsed.stream().forEach(r -> r.setAttribute(molecularBarcodeTag, goodUmi));
readsForGoodUmi.addAll(readsToBeCollapsed);
reads.remove(umiToBeCollapsed);
}
}
molecularBarcodeCounts = new ObjectCounter<>();
for (Map.Entry<String, List<SAMRecord>> entry: this.reads.entrySet()) {
molecularBarcodeCounts.setCount(entry.getKey(), entry.getValue().size());
}
}

public static Collection<UMICollection> parseUMICollectionFile (final File input) {
IOUtil.assertFileIsReadable(input);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ private boolean assertAllArraysSameLength (final Collection<char []> charArrays)
* @param barcodes
* @param findIndels
* @param editDistance
* @return
* @return map from the core barcode to the list of barcodes that are collapsed into it.
*/
public Map<String, List<String>> collapseBarcodes (final ObjectCounter<String> barcodes, final boolean findIndels, final int editDistance) {
List<String> coreBarcodes = barcodes.getKeysOrderedByCount(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ public class DigitalExpressionTestUtil {
*
*/

static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam");
static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"};
public static final File IN_FILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene_retagged.bam");
public static final String [] barcodes ={"ATCAGGGACAGA", "AGGGAAAATTGA", "TTGCCTTACGCG", "TGGCGAAGAGAT", "TACAATTAAGGC"};

/**
* @return A digital expression file, which is marked as deleteOnExit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,28 +54,29 @@ public class DigitalExpressionTest {
// expected results for standard coding strand specific DGE
private static final File EXPECTED_OUTFILE = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge.txt");
private static final File EXPECTED_OUTFILE_SUMMARY = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_summary.txt");
private static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt");
public static final File EXPECTED_OUTFILE_LONG = new File("testdata/org/broadinstitute/transcriptome/barnyard/5cell3gene.dge_long.txt");

private static final File EXPECTED_OUTPUT_SINGLE_CELL=new File("testdata/org/broadinstitute/transcriptome/barnyard/1_cell.dge.txt");
//

private String GENE_NAME_TAG="gn";
private String GENE_STRAND_TAG="gs";
private String GENE_FUNCTION_TAG="gf";
private StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE;
private List<LocusFunction> LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR));
private String CELL_BARCODE_TAG="ZC";
private String MOLECULAR_BARCODE_TAG = "XM";
private int READ_MQ=10;
private Boolean USE_STRAND_INFO=true;
public static final String GENE_NAME_TAG="gn";
public static final String GENE_STRAND_TAG="gs";
public static final String GENE_FUNCTION_TAG="gf";
public static final StrandStrategy STRAND_STRATEGY = StrandStrategy.SENSE;
public static final List<LocusFunction> LOCUS_FUNCTION_LIST=new ArrayList<>(Arrays.asList(LocusFunction.CODING, LocusFunction.UTR));
public static final String CELL_BARCODE_TAG="ZC";
public static final String MOLECULAR_BARCODE_TAG = "XM";
public static final int READ_MQ=10;
private static final Boolean USE_STRAND_INFO=true;


private UMIIterator getUMIIterator (final File inFile) {
private static UMIIterator getUMIIterator (final File inFile) {
List<String> cellBarcodes = Arrays.asList(DigitalExpressionTestUtil.barcodes);

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG,
GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, true, cellBarcodes);
GENE_STRAND_TAG, GENE_FUNCTION_TAG, STRAND_STRATEGY, LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ, true, cellBarcodes, false, false,
false, null);

return (umiIterator);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,20 @@
*/
package org.broadinstitute.dropseqrna.barnyard.digitalexpression;

import htsjdk.samtools.util.IterableAdapter;
import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTest;
import org.broadinstitute.dropseqrna.barnyard.DigitalExpressionTestUtil;
import org.broadinstitute.dropseqrna.barnyard.GeneFunctionCommandLineBase;
import org.broadinstitute.dropseqrna.utils.ObjectCounter;
import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil;
import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator;
import org.testng.Assert;
import org.testng.annotations.Test;
import picard.sam.util.Pair;
import picard.util.TabbedTextFileWithHeaderParser;

import java.io.File;
import java.util.Collection;
import java.util.Random;


import java.util.*;


public class UMICollectionTest {
Expand Down Expand Up @@ -130,4 +135,31 @@ public void testDownsample() {
testDownsampleRate(0.001, 1000000.0, 0.0005);
}

// Do UMI collapse and confirm that the UMI counts are the same as DigitalExpressionTest.doWork()
@Test
public void testEditDistanceCollapse() {
final UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false),
DigitalExpressionTest.GENE_NAME_TAG,
DigitalExpressionTest.GENE_STRAND_TAG, DigitalExpressionTest.GENE_FUNCTION_TAG, DigitalExpressionTest.STRAND_STRATEGY,
DigitalExpressionTest.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
"XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ,
false, Arrays.asList(DigitalExpressionTestUtil.barcodes), false, false,
true, null);

final Map<Pair<String, String>, Integer> umiCounts = new HashMap<>();
for (final UMICollection c : new IterableAdapter<UMICollection>(umiIterator)) {
c.collapseThisByEditDistance(1, DigitalExpressionTest.MOLECULAR_BARCODE_TAG);
final Pair<String, String> key = new Pair<>(c.getCellBarcode(), c.getGeneName());
umiCounts.put(key, c.getMolecularBarcodeCounts().getSize());
}
final Map<Pair<String, String>, Integer> expectedUmiCounts = new HashMap<>();
final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(DigitalExpressionTest.EXPECTED_OUTFILE_LONG);
for (final TabbedTextFileWithHeaderParser.Row row: parser) {
final String cellBarcode = row.getField("CELL");
final String geneName = row.getField("GENE");
final int umiCount = row.getIntegerField("UMI_COUNT");
expectedUmiCounts.put(new Pair<>(cellBarcode, geneName), umiCount);
}
Assert.assertEquals(umiCounts, expectedUmiCounts);
}
}

0 comments on commit 3df3e34

Please sign in to comment.