Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to collapse UMIs in UMICollection by edit distance in place #432

Merged
merged 3 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ private void digitalExpression(List<String> cellBarcodes) {
writeDgeHeader(out);

// TODO should the ambiguous reads handling be a parameter? It's set to false by default for DGE to get rid of ambiguous gene assignments on reads
UMIIterator realUMIIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false),
UMIIterator realUMIIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, false, cellBarcodes,
false, OMIT_MISSING_CELLS);
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setCellBarcodes(cellBarcodes).
recordCellsInInput(OMIT_MISSING_CELLS).build();
CloseableIterator<UMICollection> umiIterator = realUMIIterator;

if (OMIT_MISSING_CELLS) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,10 @@ protected int doWork() {
this.CELL_BC_FILE, this.READ_MQ, this.MIN_NUM_TRANSCRIPTS_PER_CELL,
this.MIN_NUM_GENES_PER_CELL, this.MIN_NUM_READS_PER_CELL, this.NUM_CORE_BARCODES, this.EDIT_DISTANCE, this.MIN_BC_READ_THRESHOLD));

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false);
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

UMICollection batch;

Expand Down Expand Up @@ -163,10 +163,10 @@ public ObjectCounter<String> getNumTranscriptsPerCell (final List<File> bamFile,
SamReaderFactory factory= SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE);
SamHeaderAndIterator headerIterator= SamFileMergeUtil.mergeInputs(bamFile, false, factory);

UMIIterator umiIterator = new UMIIterator(headerIterator,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerIterator,
geneNameTag, strandTag, geneFunctionTag,
strategy, locusFunctionList, functionStrategy, cellBarcodeTag, molecularBarcodeTag,
mapQuality, false, cellBarcodes);
mapQuality).setCellBarcodes(cellBarcodes).build();

ObjectCounter<String> transcriptsPerCell = new ObjectCounter<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@
import org.broadinstitute.dropseqrna.utils.readiterators.GeneFunctionProcessor;
import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil;
import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.StrandStrategy;
import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator;
import picard.cmdline.StandardOptionDefinitions;
import picard.illumina.BarcodeMetric;

import java.io.BufferedWriter;
import java.io.File;
Expand Down Expand Up @@ -224,10 +222,10 @@ private Map<String, ChimericUmiCollection> identifyChimericsAndWriteReport(boole
final Set<String> cellBarcodes=getCellBarcodes();

PeekableIterator<UMICollection> umiIterator = new PeekableIterator<>(
new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false));
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build());

// Remember {CBC, UMI, Gene} pairs to be marked chimeric
final Map<String, ChimericUmiCollection> chimerics = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ protected int doWork() {
mapContainer = new SingleOrganismMapContainer(cellBarcodes);

// gene/exon tags are sorted first, followed by cells
UMIIterator umiIterator = new UMIIterator(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes);
this.READ_MQ).setCellBarcodes(cellBarcodes).build();


String gene = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ protected int doWork() {

FilteredUmiMetrics metrics = new FilteredUmiMetrics();

UMIIterator umiIterator = new UMIIterator(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, false, false, true, null);
this.READ_MQ).setCellBarcodes(cellBarcodes).retainReads(true).build();

while (umiIterator.hasNext()) {
UMICollection c = umiIterator.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,7 @@
package org.broadinstitute.dropseqrna.barnyard.digitalexpression;

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.*;

import org.broadinstitute.dropseqrna.barnyard.GatherMolecularBarcodeDistributionByGene;
import org.broadinstitute.dropseqrna.utils.ObjectCounter;
Expand Down Expand Up @@ -158,14 +152,45 @@ public int getDigitalExpression (final int minBCReadThreshold, final int editDis
* For a list of molecular barcodes, collapse them by edit distance.
* @param counts
* @param editDistance
* @param threshold
* @return
* @return A new object counter with the molecular barcodes collapsed by edit distance.
*/
private ObjectCounter<String> collapseByEditDistance (final ObjectCounter<String> counts, final int editDistance) {
ObjectCounter<String> result = mbed.collapseAndMergeBarcodes(counts, false, editDistance);
return (result);
}

/**
* Collapse the molecular barcodes in this collection by edit distance. This only makes sense if reads != null,
* because otherwise, one could just call getMolecularBarcodeCountsCollapsed to get collapsed read counts.
* The reads are re-tagged with the collapsed molecular barcode, and re-grouped, and the molecular barcode counts are updated.
* @param editDistance
* @param molecularBarcodeTag
*/
public void collapseThisByEditDistance(final int editDistance, final String molecularBarcodeTag) {
if (reads == null) {
throw new IllegalStateException("It doesn't make sense to collapse in place without reads");
}
if (editDistance < 1) {
throw new IllegalArgumentException("Edit distance must be at least 1");
}
Map<String, List<String>> collapsedToUncollapsed = mbed.collapseBarcodes(this.molecularBarcodeCounts, false, editDistance);
for (Map.Entry<String, List<String>> entry: collapsedToUncollapsed.entrySet()) {
final String goodUmi = entry.getKey();
final List<String> umisToBeCollapsed = entry.getValue();
final List<SAMRecord> readsForGoodUmi = reads.get(goodUmi);
for (final String umiToBeCollapsed : umisToBeCollapsed) {
final List<SAMRecord> readsToBeCollapsed = reads.get(umiToBeCollapsed);
readsToBeCollapsed.stream().forEach(r -> r.setAttribute(molecularBarcodeTag, goodUmi));
readsForGoodUmi.addAll(readsToBeCollapsed);
reads.remove(umiToBeCollapsed);
}
}
molecularBarcodeCounts = new ObjectCounter<>();
for (Map.Entry<String, List<SAMRecord>> entry: this.reads.entrySet()) {
molecularBarcodeCounts.setCount(entry.getKey(), entry.getValue().size());
}
}

public static Collection<UMICollection> parseUMICollectionFile (final File input) {
IOUtil.assertFileIsReadable(input);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ public class BiasedBarcodeCollectionFactory {
public UMIIterator prepareUMIIterator(final List<File> inputFiles, final String geneExonTag, final String cellBarcodeTag, final String molBCTag, final String strandTag,
final int readMQ, final List<String> cellBarcodes) {

return new UMIIterator(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory),
return new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory),
GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG,
GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY,
GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
cellBarcodeTag, molBCTag, readMQ, false, cellBarcodes, true, false);
cellBarcodeTag, molBCTag, readMQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,10 +588,10 @@ public String fixUMI (final String cellBarcode, final String umi, final int erro
public UMIIterator prepareUMIIterator() {
List<String> barcodes=getCellBarcodes();

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, barcodes, true, false);
this.READ_MQ).setCellBarcodes(barcodes).cellFirstSort(true).build();

return (umiIterator);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,12 @@ protected int doWork() {

// build up the UMI per cell data set.
Collection <String> cellBarcodes = null;
UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG,
GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY,
GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false);
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

// get list of barcodes that have enough UMIs, and are not polyT biased.
UMIsPerCellResult umiResult=getUMIsPerCell(umiIterator, this.MIN_UMIS_PER_CELL, this.UMI_BIAS_BASE, this.UMI_BIAS_THRESHOLD, null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ private boolean assertAllArraysSameLength (final Collection<char []> charArrays)
* @param barcodes
* @param findIndels
* @param editDistance
* @return
* @return map from the core barcode to the list of barcodes that are collapsed into it.
*/
public Map<String, List<String>> collapseBarcodes (final ObjectCounter<String> barcodes, final boolean findIndels, final int editDistance) {
List<String> coreBarcodes = barcodes.getKeysOrderedByCount(true);
Expand Down
Loading
Loading