Skip to content

Commit

Permalink
UMIIteratorBuilder
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed Jul 3, 2024
1 parent 3df3e34 commit 139bc61
Show file tree
Hide file tree
Showing 11 changed files with 132 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,10 @@ private void digitalExpression(List<String> cellBarcodes) {
writeDgeHeader(out);

// TODO should the ambiguous reads handling be a parameter? It's set to false by default for DGE to get rid of ambiguous gene assignments on reads
UMIIterator realUMIIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false),
UMIIterator realUMIIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(this.INPUT), false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, false, cellBarcodes,
false, OMIT_MISSING_CELLS);
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setCellBarcodes(cellBarcodes).
recordCellsInInput(OMIT_MISSING_CELLS).build();
CloseableIterator<UMICollection> umiIterator = realUMIIterator;

if (OMIT_MISSING_CELLS) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,10 @@ protected int doWork() {
this.CELL_BC_FILE, this.READ_MQ, this.MIN_NUM_TRANSCRIPTS_PER_CELL,
this.MIN_NUM_GENES_PER_CELL, this.MIN_NUM_READS_PER_CELL, this.NUM_CORE_BARCODES, this.EDIT_DISTANCE, this.MIN_BC_READ_THRESHOLD));

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false);
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

UMICollection batch;

Expand Down Expand Up @@ -163,10 +163,10 @@ public ObjectCounter<String> getNumTranscriptsPerCell (final List<File> bamFile,
SamReaderFactory factory= SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.EAGERLY_DECODE);
SamHeaderAndIterator headerIterator= SamFileMergeUtil.mergeInputs(bamFile, false, factory);

UMIIterator umiIterator = new UMIIterator(headerIterator,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerIterator,
geneNameTag, strandTag, geneFunctionTag,
strategy, locusFunctionList, functionStrategy, cellBarcodeTag, molecularBarcodeTag,
mapQuality, false, cellBarcodes);
mapQuality).setCellBarcodes(cellBarcodes).build();

ObjectCounter<String> transcriptsPerCell = new ObjectCounter<>();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,8 @@
import org.broadinstitute.dropseqrna.utils.readiterators.GeneFunctionProcessor;
import org.broadinstitute.dropseqrna.utils.readiterators.SamFileMergeUtil;
import org.broadinstitute.dropseqrna.utils.readiterators.SamHeaderAndIterator;
import org.broadinstitute.dropseqrna.utils.readiterators.StrandStrategy;
import org.broadinstitute.dropseqrna.utils.readiterators.UMIIterator;
import picard.cmdline.StandardOptionDefinitions;
import picard.illumina.BarcodeMetric;

import java.io.BufferedWriter;
import java.io.File;
Expand Down Expand Up @@ -224,10 +222,10 @@ private Map<String, ChimericUmiCollection> identifyChimericsAndWriteReport(boole
final Set<String> cellBarcodes=getCellBarcodes();

PeekableIterator<UMICollection> umiIterator = new PeekableIterator<>(
new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false));
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build());

// Remember {CBC, UMI, Gene} pairs to be marked chimeric
final Map<String, ChimericUmiCollection> chimerics = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ protected int doWork() {
mapContainer = new SingleOrganismMapContainer(cellBarcodes);

// gene/exon tags are sorted first, followed by cells
UMIIterator umiIterator = new UMIIterator(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIterator, GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes);
this.READ_MQ).setCellBarcodes(cellBarcodes).build();


String gene = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ protected int doWork() {

FilteredUmiMetrics metrics = new FilteredUmiMetrics();

UMIIterator umiIterator = new UMIIterator(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(headerAndIter,GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, false, false, true, null);
this.READ_MQ).setCellBarcodes(cellBarcodes).retainReads(true).build();

while (umiIterator.hasNext()) {
UMICollection c = umiIterator.next();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,11 @@ public class BiasedBarcodeCollectionFactory {
public UMIIterator prepareUMIIterator(final List<File> inputFiles, final String geneExonTag, final String cellBarcodeTag, final String molBCTag, final String strandTag,
final int readMQ, final List<String> cellBarcodes) {

return new UMIIterator(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory),
return new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(inputFiles, false, samReaderFactory),
GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG,
GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY,
GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
cellBarcodeTag, molBCTag, readMQ, false, cellBarcodes, true, false);
cellBarcodeTag, molBCTag, readMQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,10 +588,10 @@ public String fixUMI (final String cellBarcode, final String umi, final int erro
public UMIIterator prepareUMIIterator() {
List<String> barcodes=getCellBarcodes();

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(INPUT, false, samReaderFactory),
GENE_NAME_TAG, GENE_STRAND_TAG, GENE_FUNCTION_TAG,
this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, this.FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, barcodes, true, false);
this.READ_MQ).setCellBarcodes(barcodes).cellFirstSort(true).build();

return (umiIterator);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,12 @@ protected int doWork() {

// build up the UMI per cell data set.
Collection <String> cellBarcodes = null;
UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(this.INPUT, false),
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(this.INPUT, false),
GeneFunctionCommandLineBase.DEFAULT_GENE_NAME_TAG, GeneFunctionCommandLineBase.DEFAULT_GENE_STRAND_TAG,
GeneFunctionCommandLineBase.DEFAULT_GENE_FUNCTION_TAG, GeneFunctionCommandLineBase.DEFAULT_STRAND_STRATEGY,
GeneFunctionCommandLineBase.DEFAULT_LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG,
this.READ_MQ, false, cellBarcodes, true, false);
this.READ_MQ).setCellBarcodes(cellBarcodes).cellFirstSort(true).build();

// get list of barcodes that have enough UMIs, and are not polyT biased.
UMIsPerCellResult umiResult=getUMIsPerCell(umiIterator, this.MIN_UMIS_PER_CELL, this.UMI_BIAS_BASE, this.UMI_BIAS_THRESHOLD, null);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,72 +49,7 @@ public class UMIIterator implements CloseableIterator<UMICollection> {
private final StringInterner stringCache = new StringInterner();
private final Set<String> cellBarcodesSeen;
private final boolean retainReads;
/**
* Construct an object that generates UMI objects from a BAM file
* @param headerAndIterator The BAM records to extract UMIs from
* @param geneTag The gene tag on BAM records
* @param cellBarcodeTag The cell barcode tag on BAM records
* @param molecularBarcodeTag The molecular barcode tag on BAM records
* @param geneStrandTag The strand tag on BAM records
* @param readMQ The minimum map quality of the reads
* @param assignReadsToAllGenes Should records tagged with multiple genes be double counted, once for each gene?
* @param strandStrategy should the gene and read strand match for the read to be accepted
* @param cellBarcodes The list of cell barcode tag values that match the <cellBarcodeTag> tag on the BAM records.
* Only reads with these values will be used. If set to null, all cell barcodes are used.
*/
public UMIIterator(final SamHeaderAndIterator headerAndIterator,
final String geneTag,
final String geneStrandTag,
final String geneFunctionTag,
final StrandStrategy strandStrategy,
final Collection <LocusFunction> acceptedLociFunctions,
FunctionalDataProcessorStrategy functionStrategy,
final String cellBarcodeTag,
final String molecularBarcodeTag,
final int readMQ,
final boolean assignReadsToAllGenes,
final Collection<String> cellBarcodes) {
this(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy, acceptedLociFunctions, functionStrategy,
cellBarcodeTag, molecularBarcodeTag, readMQ, assignReadsToAllGenes, cellBarcodes, false, false);
}

/**
* Construct an object that generates UMI objects from a BAM file
* @param headerAndIterator The BAM records to extract UMIs from
* @param geneTag The geneExon tag on BAM records
* @param cellBarcodeTag The cell barcode tag on BAM records
* @param molecularBarcodeTag The molecular barcode tag on BAM records
* @param geneStrandTag The strand tag on BAM records
* @param readMQ The minimum map quality of the reads
* @param assignReadsToAllGenes Should records tagged with multiple genes be double counted, once for each gene?
* @param strandStrategy should the gene and read strand match for the read to be accepted
* @param cellBarcodes The list of cell barcode tag values that match the <cellBarcodeTag> tag on the BAM records.
* Only reads with these values will be used. If set to null, all cell barcodes are used.
* @param cellFirstSort if true, then cell barcodes are sorted first, followed by gene/exon tags.
* If false, then gene/exon tags are sorted first, followed by cells. false is the default and used in the other constructor.
* @param recordCellsInInput While sorting the input, keep track of what cells appear in the input. This record
* is not complete until iteration is started.
*/
public UMIIterator(final SamHeaderAndIterator headerAndIterator,
final String geneTag,
final String geneStrandTag,
final String geneFunctionTag,
final StrandStrategy strandStrategy,
final Collection <LocusFunction> acceptedLociFunctions,
FunctionalDataProcessorStrategy functionStrategy,
final String cellBarcodeTag,
final String molecularBarcodeTag,
final int readMQ,
final boolean assignReadsToAllGenes,
final Collection<String> cellBarcodes,
final boolean cellFirstSort,
final boolean recordCellsInInput) {

this(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy, acceptedLociFunctions, functionStrategy,
cellBarcodeTag, molecularBarcodeTag, readMQ, assignReadsToAllGenes, cellBarcodes, cellFirstSort, recordCellsInInput, false,
null);

}
/**
* Construct an object that generates UMI objects from a BAM file
* @param headerAndIterator The BAM records to extract UMIs from
Expand All @@ -135,7 +70,7 @@ public UMIIterator(final SamHeaderAndIterator headerAndIterator,
* This is false by default when the iterator is constructed via UMIIteratorBuilder. If false, reads can be simplified for faster serialization.
*
*/
public UMIIterator(final SamHeaderAndIterator headerAndIterator,
private UMIIterator(final SamHeaderAndIterator headerAndIterator,
final String geneTag,
final String geneStrandTag,
final String geneFunctionTag,
Expand Down Expand Up @@ -376,4 +311,105 @@ public int compare(SAMRecord o1, SAMRecord o2) {
return comp.fileOrderCompare(o1, o2);
}
}

/**
 * Fluent builder for {@link UMIIterator}.
 *
 * <p>The ten arguments that every iterator needs are supplied to the builder's constructor.
 * Everything else is optional: the boolean options all default to {@code false}, and
 * {@code cellBarcodes} and {@code intervals} default to {@code null}. Each option method
 * returns this builder so calls can be chained, ending with {@link #build()}.
 */
public static class UMIIteratorBuilder {
    // required parameters
    private final SamHeaderAndIterator headerAndIterator;
    private final String geneTag;
    private final String geneStrandTag;
    private final String geneFunctionTag;
    private final StrandStrategy strandStrategy;
    private final Collection<LocusFunction> acceptedLociFunctions;
    private final FunctionalDataProcessorStrategy functionStrategy;
    private final String cellBarcodeTag;
    private final String molecularBarcodeTag;
    private final int readMQ;

    // parameters with default values
    private boolean assignReadsToAllGenes = false;
    private boolean cellFirstSort = false;
    private boolean recordCellsInInput = false;
    private boolean retainReads = false;

    // nullable parameters
    private Collection<String> cellBarcodes;
    private IntervalList intervals;

    /**
     * Creates a builder holding the required UMIIterator arguments.
     *
     * @param headerAndIterator     The BAM records to extract UMIs from
     * @param geneTag               The gene tag on BAM records
     * @param geneStrandTag         The strand tag on BAM records
     * @param geneFunctionTag       The gene function tag on BAM records
     * @param strandStrategy        should the gene and read strand match for the read to be accepted
     * @param acceptedLociFunctions the locus functions passed through to the iterator
     * @param functionStrategy      the functional data processing strategy passed through to the iterator
     * @param cellBarcodeTag        The cell barcode tag on BAM records
     * @param molecularBarcodeTag   The molecular barcode tag on BAM records
     * @param readMQ                The minimum map quality of the reads
     */
    public UMIIteratorBuilder(SamHeaderAndIterator headerAndIterator, String geneTag, String geneStrandTag,
                              String geneFunctionTag, StrandStrategy strandStrategy,
                              Collection<LocusFunction> acceptedLociFunctions,
                              FunctionalDataProcessorStrategy functionStrategy, String cellBarcodeTag,
                              String molecularBarcodeTag, int readMQ) {
        this.headerAndIterator = headerAndIterator;
        this.geneTag = geneTag;
        this.geneStrandTag = geneStrandTag;
        this.geneFunctionTag = geneFunctionTag;
        this.strandStrategy = strandStrategy;
        this.acceptedLociFunctions = acceptedLociFunctions;
        this.functionStrategy = functionStrategy;
        this.cellBarcodeTag = cellBarcodeTag;
        this.molecularBarcodeTag = molecularBarcodeTag;
        this.readMQ = readMQ;
    }

    /** @return the currently configured assignReadsToAllGenes value (false unless set) */
    public boolean isAssignReadsToAllGenes() {
        return assignReadsToAllGenes;
    }

    /**
     * @param assignReadsToAllGenes Should records tagged with multiple genes be double counted,
     *                              once for each gene?
     * @return this builder
     */
    public UMIIteratorBuilder assignReadsToAllGenes(boolean assignReadsToAllGenes) {
        this.assignReadsToAllGenes = assignReadsToAllGenes;
        return this;
    }

    /**
     * Alias for {@link #assignReadsToAllGenes(boolean)}, using the same {@code set} prefix as
     * {@link #setCellBarcodes(Collection)} and {@link #setIntervals(IntervalList)} so that
     * callers written against either naming style compile.
     *
     * @param assignReadsToAllGenes Should records tagged with multiple genes be double counted,
     *                              once for each gene?
     * @return this builder
     */
    public UMIIteratorBuilder setAssignReadsToAllGenes(boolean assignReadsToAllGenes) {
        return assignReadsToAllGenes(assignReadsToAllGenes);
    }

    /** @return the currently configured cellFirstSort value (false unless set) */
    public boolean isCellFirstSort() {
        return cellFirstSort;
    }

    /**
     * @param cellFirstSort if true, then cell barcodes are sorted first, followed by gene/exon tags.
     *                      If false, then gene/exon tags are sorted first, followed by cells.
     * @return this builder
     */
    public UMIIteratorBuilder cellFirstSort(boolean cellFirstSort) {
        this.cellFirstSort = cellFirstSort;
        return this;
    }

    /** @return the currently configured recordCellsInInput value (false unless set) */
    public boolean isRecordCellsInInput() {
        return recordCellsInInput;
    }

    /**
     * @param recordCellsInInput While sorting the input, keep track of what cells appear in the
     *                           input. This record is not complete until iteration is started.
     * @return this builder
     */
    public UMIIteratorBuilder recordCellsInInput(boolean recordCellsInInput) {
        this.recordCellsInInput = recordCellsInInput;
        return this;
    }

    /** @return the currently configured retainReads value (false unless set) */
    public boolean isRetainReads() {
        return retainReads;
    }

    /**
     * @param retainReads passed through to the iterator as its retainReads argument.
     *                    NOTE(review): presumably controls whether the reads are kept on each
     *                    UMICollection - confirm against the UMIIterator constructor Javadoc.
     * @return this builder
     */
    public UMIIteratorBuilder retainReads(boolean retainReads) {
        this.retainReads = retainReads;
        return this;
    }

    /** @return the configured cell barcodes, or null if none were set */
    public Collection<String> getCellBarcodes() {
        return cellBarcodes;
    }

    /**
     * @param cellBarcodes The list of cell barcode tag values that match the cell barcode tag on
     *                     the BAM records. Only reads with these values will be used. If set to
     *                     null, all cell barcodes are used.
     * @return this builder
     */
    public UMIIteratorBuilder setCellBarcodes(Collection<String> cellBarcodes) {
        this.cellBarcodes = cellBarcodes;
        return this;
    }

    /** @return the configured intervals, or null if none were set */
    public IntervalList getIntervals() {
        return intervals;
    }

    /**
     * @param intervals passed through to the iterator as its intervals argument; may be null.
     *                  NOTE(review): presumably restricts the reads considered to these
     *                  intervals - confirm against the UMIIterator constructor.
     * @return this builder
     */
    public UMIIteratorBuilder setIntervals(IntervalList intervals) {
        this.intervals = intervals;
        return this;
    }

    /**
     * Constructs the iterator from the required arguments and whatever options have been set.
     *
     * @return a new UMIIterator
     */
    public UMIIterator build() {
        return new UMIIterator(headerAndIterator, geneTag, geneStrandTag, geneFunctionTag, strandStrategy,
                acceptedLociFunctions, functionStrategy, cellBarcodeTag, molecularBarcodeTag, readMQ,
                assignReadsToAllGenes, cellBarcodes, cellFirstSort, recordCellsInInput, retainReads, intervals);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,9 @@ public class DigitalExpressionTest {
private static UMIIterator getUMIIterator (final File inFile) {
List<String> cellBarcodes = Arrays.asList(DigitalExpressionTestUtil.barcodes);

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG,
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG,
GENE_STRAND_TAG, GENE_FUNCTION_TAG, STRAND_STRATEGY, LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ, true, cellBarcodes, false, false,
false, null);
CELL_BARCODE_TAG, MOLECULAR_BARCODE_TAG, READ_MQ).assignReadsToAllGenes(true).setCellBarcodes(cellBarcodes).build();

return (umiIterator);
}
Expand Down Expand Up @@ -202,13 +201,16 @@ public void DGEIntegrationTest () {
Assert.assertNotEquals(count,0);
}

@Test
public void testTwoGenesOnSameStrand () {
List barcodes = Collections.singletonList("FOO");
File inFile = new File ("");

UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG,
GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
this.CELL_BARCODE_TAG, this.MOLECULAR_BARCODE_TAG, this.READ_MQ, true, barcodes);
UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(
SamFileMergeUtil.mergeInputs(Collections.singletonList(inFile), false), GENE_NAME_TAG,
GENE_STRAND_TAG, GENE_FUNCTION_TAG, this.STRAND_STRATEGY, this.LOCUS_FUNCTION_LIST,
GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY, this.CELL_BARCODE_TAG,
this.MOLECULAR_BARCODE_TAG, this.READ_MQ).setAssignReadsToAllGenes(true).setCellBarcodes(barcodes).build();
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,12 @@ public void testDownsample() {
// Do UMI collapse and confirm that the UMI counts are the same as DigitalExpressionTest.doWork()
@Test
public void testEditDistanceCollapse() {
final UMIIterator umiIterator = new UMIIterator(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false),
final UMIIterator umiIterator = new UMIIterator.UMIIteratorBuilder(SamFileMergeUtil.mergeInputs(Collections.singletonList(DigitalExpressionTestUtil.IN_FILE), false),
DigitalExpressionTest.GENE_NAME_TAG,
DigitalExpressionTest.GENE_STRAND_TAG, DigitalExpressionTest.GENE_FUNCTION_TAG, DigitalExpressionTest.STRAND_STRATEGY,
DigitalExpressionTest.LOCUS_FUNCTION_LIST, GeneFunctionCommandLineBase.DEFAULT_FUNCTIONAL_STRATEGY,
"XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ,
false, Arrays.asList(DigitalExpressionTestUtil.barcodes), false, false,
true, null);
"XC", DigitalExpressionTest.MOLECULAR_BARCODE_TAG, DigitalExpressionTest.READ_MQ).
setCellBarcodes(Arrays.asList(DigitalExpressionTestUtil.barcodes)).retainReads(true).build();

final Map<Pair<String, String>, Integer> umiCounts = new HashMap<>();
for (final UMICollection c : new IterableAdapter<UMICollection>(umiIterator)) {
Expand Down

0 comments on commit 139bc61

Please sign in to comment.