Skip to content

Commit

Permalink
Various modes for gene order
Browse files Browse the repository at this point in the history
  • Loading branch information
alecw committed May 22, 2024
1 parent d2be0f7 commit b0a5c8d
Show file tree
Hide file tree
Showing 3 changed files with 219 additions and 38 deletions.
175 changes: 142 additions & 33 deletions src/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCells.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,15 @@

package org.broadinstitute.dropseqrna.eqtl;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.PrintStream;
import java.io.*;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;

import htsjdk.samtools.util.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineParser;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.dropseqrna.barnyard.ParseBarcodeFile;
import org.broadinstitute.dropseqrna.barnyard.digitalexpression.DgeHeader;
Expand All @@ -50,9 +41,6 @@
import org.broadinstitute.dropseqrna.utils.RetainRemoveList;
import org.broadinstitute.dropseqrna.utils.io.ErrorCheckingPrintStream;

import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.util.TabbedInputParser;
Expand Down Expand Up @@ -125,13 +113,16 @@ public class CreateMetaCells extends CommandLineProgram {
@Argument(doc="Column label for metacell column in DONOR_MAP.", mutex = {"SINGLE_METACELL_LABEL"})
public String METACELL_COLUMN = "bestSample";

@Argument(doc="Validate order of genes in the input, or sort (alphanumeric), or emit in input order without sorting.")
public GeneSort GENE_SORT=GeneSort.ALLOW_UNSORTED;

private final String GENE_HEADER="GENE";
private final int PROGRESS_INTERVAL=1000;

private void validateClusterAssignmentArguments () {
if (this.CLUSTER_ASSIGNMENTS_FILE!=null) {
IOUtil.assertFileIsReadable(this.CLUSTER_ASSIGNMENTS_FILE);
if (CLUSTER_ASSIGNMENT==null || CLUSTER_ASSIGNMENT.size()==0)
if (CLUSTER_ASSIGNMENT==null || CLUSTER_ASSIGNMENT.isEmpty())
throw new IllegalArgumentException("If CLUSTER_ASSIGNMENTS_FILE is set, then the CLUSTER_ASSIGNMENT must have at least one value.");
if (MERGED_DGE_HEADER_FILE==null && PREFIX==null)
throw new IllegalArgumentException("If CLUSTER_ASSIGNMENTS_FILE is set, then must set either MERGED_DGE_HEADER_FILE or PREFIX");
Expand All @@ -156,7 +147,7 @@ public int doWork() {
TabbedInputParser parser = new TabbedInputParser(false, in);
if (!parser.hasNext()) {
log.error("No lines in input file" + this.INPUT);
parser.close();
CloserUtil.close(parser);
CloserUtil.close(out);
return 1;
}
Expand Down Expand Up @@ -199,23 +190,30 @@ public int doWork() {
// write the header for the output file.
writeHeader(out, donors);

final MetacellRowWriter rowWriter = new MetacellRowWriter(out, 1 + donors.size());

// parse through the DGE and write out the meta-cell file.
int counter=0;
String prevGene = null;
while (parser.hasNext()) {
String [] line = parser.next();
String gene = line[0];
if (GENE_SORT == GeneSort.REQUIRE_SORTED && prevGene != null && gene.compareTo(prevGene) <= 0) {
throw new IllegalArgumentException("Genes are not sorted in input file. Gene '" + gene + "' is out of order.");
}
prevGene = gene;
Map<String, Double> umis = getUMIsPerDonor (header, line, donorMap, this.STRATEGY);
metrics = addUMIsPerDonor(metrics, umis);
List<String> lineParsed = buildBodyLine(gene, umis, donors, this.INTEGER_FORMAT);
if (lineParsed!=null)
writeLine(lineParsed, out, false);
rowWriter.writeRow(lineParsed);
if (counter > 0 && counter%PROGRESS_INTERVAL==0) log.info("Processed [" + counter +"] lines");
counter++;
}
log.info("Processed [" + counter +"] lines in total");
// cleanup.
parser.close();
out.close();
CloserUtil.close(parser);
rowWriter.close();

// write metrics output if requested
if (this.METRICS!=null) metrics.writeMetrics(this.METRICS, this.INTEGER_FORMAT);
Expand Down Expand Up @@ -265,7 +263,7 @@ List<String> getCellBarcodesInClusters (String prefix, Set<String> clusterLabels
private Map<String, String> filterDonorMapByClusterLabels (String prefix, Collection<String> clusterLabels, final File clusterAssignmentFile, final Map<String,String> donorMap) {
// no op if prefix is unset or cluster assignment is null.
if (prefix==null || clusterAssignmentFile==null) return (donorMap);
if (clusterLabels==null || clusterLabels.size()==0) return (donorMap);
if (clusterLabels==null || clusterLabels.isEmpty()) return (donorMap);

Set<String> labels=new HashSet<>(clusterLabels);
Set<String> cb = new HashSet<> (getCellBarcodesInClusters(prefix, labels, clusterAssignmentFile));
Expand Down Expand Up @@ -343,12 +341,8 @@ private Map<String, Double> getUMIsPerDonor (String [] header, String [] line, f
double count = Double.parseDouble(line[i]);
String donor = donorMap.get(cell);
if (donor!=null) {
List<Double> vals = umiListPerDonor.get(donor);
if (vals==null) {
vals= new ArrayList<>();
umiListPerDonor.put(donor, vals);
}
vals.add(count);
List<Double> vals = umiListPerDonor.computeIfAbsent(donor, k -> new ArrayList<>());
vals.add(count);
}
}

Expand Down Expand Up @@ -407,7 +401,7 @@ private Set<String> getElementsToRetain(final List<String> elements, final File
Set <String> toRetain = getRetainRemoveSet(retain);

// in the case where the input retain file is not null, but has 0 entries, there is nothing to retain.
if (retain!=null & toRetain.size()==0)
if (retain!=null & toRetain.isEmpty())
return Collections.emptySet();

RetainRemoveList<String> rrl = new RetainRemoveList<>();
Expand All @@ -427,18 +421,16 @@ private void writeHeader (final PrintStream out, final List<String> donors) {
List<String> lineOut = new ArrayList<>();

// short-circuit: only write the gene header if there is at least one donor.
if (donors.size()==0) return;
if (donors.isEmpty()) return;

lineOut.add(this.GENE_HEADER);
lineOut.addAll(donors);
String b = StringUtils.join(lineOut, "\t");
out.println(b);
}

private void writeLine (final List<String> line, final PrintStream out, final boolean addGeneHeader) {
private void writeLine (final List<String> line, final PrintStream out) {
List<String> lineOut = new ArrayList<>(line);
if (addGeneHeader)
lineOut.add(0, this.GENE_HEADER);
String b = StringUtils.join(lineOut, "\t");
out.println(b);
}
Expand Down Expand Up @@ -473,5 +465,122 @@ public static void main(final String[] args) {
System.exit(new CreateMetaCells().instanceMain(args));
}

public enum GeneSort implements CommandLineParser.ClpEnum {
ALLOW_UNSORTED("Emit genes in the order they appear in the input file."),
REQUIRE_SORTED("Fail if the input file is not sorted by gene name."),
SORT("Sort genes by name before writing (uses SortingCollection).");

private final String description;

GeneSort(final String description) {
this.description = description;
}

public String getHelpDoc() {
return this.description;
}
}

private static class StringListCodec implements SortingCollection.Codec<List<String>> {
final int numElements;
private ObjectOutputStream outputStream = null;
private ObjectInputStream inputReader = null;

public StringListCodec(int numElements) {
this.numElements = numElements;
}

@Override
public void setOutputStream(OutputStream outputStream) {
try {
this.outputStream = new ObjectOutputStream(outputStream);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}

@Override
public void setInputStream(InputStream inputStream) {
try {
this.inputReader = new ObjectInputStream(inputStream);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}

@Override
public void encode(List<String> strings) {
if (strings.size() != this.numElements) {
throw new IllegalArgumentException("Expected " + this.numElements + " elements, but got " + strings.size());
}
strings.stream().forEach(s -> {
try {
this.outputStream.writeObject(s);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
});
}

@Override
public List<String> decode() {
List<String> ret = new ArrayList<>(this.numElements);
for (int i = 0; i < this.numElements; ++i) {
try {
ret.add((String) this.inputReader.readObject());
} catch (EOFException e) {
return null;
} catch (IOException | ClassNotFoundException e) {
throw new RuntimeIOException(e);
}
}
return ret;
}

@Override
public SortingCollection.Codec<List<String>> clone() {
return new StringListCodec(numElements);
}
}

private static class StringListComparator implements Comparator<List<String>> {
@Override
public int compare(List<String> o1, List<String> o2) {
return o1.getFirst().compareTo(o2.getFirst());
}
}

private class MetacellRowWriter{
private final PrintStream out;
private final SortingCollection<List<String>> sortingCollection;
public MetacellRowWriter(PrintStream out, int numElements) {
this.out = out;
if (GENE_SORT == GeneSort.SORT) {
final Class c = List.class;
this.sortingCollection = SortingCollection.newInstance(
c,
new StringListCodec(numElements),
new StringListComparator(),
MAX_RECORDS_IN_RAM);
} else {
this.sortingCollection = null;
}
}

public void writeRow(List<String> row) {
if (GENE_SORT == GeneSort.SORT) {
this.sortingCollection.add(row);
} else {
writeLine(row, out);
}
}

public void close() {
if (GENE_SORT == GeneSort.SORT) {
this.sortingCollection.doneAdding();
this.sortingCollection.iterator().forEachRemaining(l -> writeLine(l, out));
}
out.close();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.broadinstitute.dropseqrna.utils.TestUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
Expand All @@ -20,27 +21,59 @@ public class CreateMetaCellsTest {
private final File EXPECTED_SINGLE_METACELL = new File(TEST_DATA_DIR, "single_metacell.txt.gz");
private final File DGE = new File("testdata/org/broadinstitute/transcriptome/barnyard/digitalexpression/test_with_header3.unpaired.dge.txt.gz");
private final File EXPECTED_METACELLS_BY_CLUSTER= new File (TEST_DATA_DIR, "cluster_1_2.meta_cells.txt");
private final File EXPECTED_METACELLS_BY_CLUSTER_SORTED= new File (TEST_DATA_DIR, "cluster_1_2.meta_cells.sorted.txt");
private final File MERGED_DGE_HEADER_FILE=new File(TEST_DATA_DIR, "test_with_header3.unpaired.dge.dge_header.txt");
private final File CLUSTER_ASSIGNMENT_FILE=new File(TEST_DATA_DIR, "test_with_header3.unpaired.dge.assign.txt");



@Test ()
public void testMetacellsWithICAClusters() throws IOException {
final CreateMetaCells clp = makeBasicTestClp();
Assert.assertEquals(clp.doWork(), 0);
Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER, clp.OUTPUT));

}

private CreateMetaCells makeBasicTestClp() throws IOException {
final CreateMetaCells clp = new CreateMetaCells();
clp.INPUT = DGE;
clp.DONOR_MAP = DONOR_MAP;
clp.CLUSTER_ASSIGNMENTS_FILE=this.CLUSTER_ASSIGNMENT_FILE;
clp.CLUSTER_ASSIGNMENT=Arrays.asList("Cluster_1", "Cluster_2");
clp.MERGED_DGE_HEADER_FILE=MERGED_DGE_HEADER_FILE;
clp.OUTPUT = File.createTempFile("testMetacellsWithICAClusters.",".metacells.txt.gz");
clp.OUTPUT.deleteOnExit();
clp.OUTPUT.deleteOnExit();
return clp;
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testInputSortRequirement() throws IOException {
final CreateMetaCells clp = makeBasicTestClp();
clp.GENE_SORT = CreateMetaCells.GeneSort.REQUIRE_SORTED;
clp.doWork();

}
@Test(dataProvider = "testMetacellsWithGeneSortDataProvider")
public void testMetacellsWithGeneSort(Integer maxRecordsInRam) throws IOException {
final CreateMetaCells clp = makeBasicTestClp();
clp.GENE_SORT = CreateMetaCells.GeneSort.SORT;
if (maxRecordsInRam != null) {
clp.MAX_RECORDS_IN_RAM = maxRecordsInRam;
}
Assert.assertEquals(clp.doWork(), 0);
Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER, clp.OUTPUT));

Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER_SORTED, clp.OUTPUT));
}



// One invocation that is small enough that SortingCollection spills to disk.
@DataProvider(name = "testMetacellsWithGeneSortDataProvider")
public Object[][] testMetacellsWithGeneSortDataProvider() {
return new Object[][]{
{null},
{3}
};
}

@Test
public void testGetCellBarcodesInClusters() {
final CreateMetaCells clp = new CreateMetaCells();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
GENE 1 2 3
ACADM 20 22 23
ACTA1 0 1 0
AGO3 13 13 17
AGO4 3 1 9
AHCTF1 13 15 6
AHCYL1 12 14 15
AKR1A1 41 31 34
AMIGO1 1 1 3
AMY2B 1 2 0
ANGEL2 6 3 8
ANKRD13C 19 20 10
APH1A 8 8 7
ARF1 57 74 64
ARHGEF2 1 2 2
ARPC5 146 180 151
ARV1 15 8 9
ASCL5 2 0 0
ASH1L 31 37 25
ASH1L-AS1 2 0 4
ATAD3C 0 1 0
ATP13A2 22 30 20
B4GALT3 7 13 10
BCAN 0 0 1
BCAS2 32 25 22
BGLAP 0 1 0
BTBD19 1 0 0
C1orf101 0 0 1
C1orf112 1 0 1
C1orf122 72 73 66
C1orf220 0 0 1
C1orf228 0 0 1
C1orf35 10 8 5
C1orf56 7 2 4
C1orf95 2 0 1
CACHD1 1 3 2
CAPN2 9 13 9
CC2D1B 1 0 0
CCNL2 23 23 15

0 comments on commit b0a5c8d

Please sign in to comment.