From b0a5c8d13d40ce9c68a5440533edef31ef6df344 Mon Sep 17 00:00:00 2001 From: Alec Wysoker Date: Wed, 22 May 2024 16:43:25 -0400 Subject: [PATCH] Various modes for gene order --- .../dropseqrna/eqtl/CreateMetaCells.java | 175 ++++++++++++++---- .../dropseqrna/eqtl/CreateMetaCellsTest.java | 43 ++++- .../eqtl/cluster_1_2.meta_cells.sorted.txt | 39 ++++ 3 files changed, 219 insertions(+), 38 deletions(-) create mode 100644 testdata/org/broadinstitute/dropseq/eqtl/cluster_1_2.meta_cells.sorted.txt diff --git a/src/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCells.java b/src/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCells.java index 3a2a75ca..7a0e0c51 100644 --- a/src/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCells.java +++ b/src/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCells.java @@ -24,24 +24,15 @@ package org.broadinstitute.dropseqrna.eqtl; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.PrintStream; +import java.io.*; import java.math.BigDecimal; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; +import htsjdk.samtools.util.*; import org.apache.commons.lang3.StringUtils; import org.apache.commons.math3.stat.descriptive.rank.Median; import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineParser; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.dropseqrna.barnyard.ParseBarcodeFile; import org.broadinstitute.dropseqrna.barnyard.digitalexpression.DgeHeader; @@ -50,9 +41,6 @@ import org.broadinstitute.dropseqrna.utils.RetainRemoveList; import org.broadinstitute.dropseqrna.utils.io.ErrorCheckingPrintStream; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.IOUtil; -import htsjdk.samtools.util.Log; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; import picard.util.TabbedInputParser; @@ -125,13 +113,16 @@ public class CreateMetaCells extends CommandLineProgram { @Argument(doc="Column label for metacell column in DONOR_MAP.", mutex = {"SINGLE_METACELL_LABEL"}) public String METACELL_COLUMN = "bestSample"; + @Argument(doc="Validate order of genes in the input, or sort (alphanumeric), or emit in input order without sorting.") + public GeneSort GENE_SORT=GeneSort.ALLOW_UNSORTED; + private final String GENE_HEADER="GENE"; private final int PROGRESS_INTERVAL=1000; private void validateClusterAssignmentArguments () { if (this.CLUSTER_ASSIGNMENTS_FILE!=null) { IOUtil.assertFileIsReadable(this.CLUSTER_ASSIGNMENTS_FILE); - if (CLUSTER_ASSIGNMENT==null || CLUSTER_ASSIGNMENT.size()==0) + if (CLUSTER_ASSIGNMENT==null || CLUSTER_ASSIGNMENT.isEmpty()) throw new IllegalArgumentException("If CLUSTER_ASSIGNMENTS_FILE is set, then the CLUSTER_ASSIGNMENT must have at least one value."); if (MERGED_DGE_HEADER_FILE==null && PREFIX==null) throw new IllegalArgumentException("If CLUSTER_ASSIGNMENTS_FILE is set, then must set either MERGED_DGE_HEADER_FILE or PREFIX"); @@ -156,7 +147,7 @@ public int doWork() { TabbedInputParser parser = new TabbedInputParser(false, in); if (!parser.hasNext()) { log.error("No lines in input file" + this.INPUT); - parser.close(); + CloserUtil.close(parser); CloserUtil.close(out); return 1; } @@ -199,23 +190,30 @@ public int doWork() { // write the header for the output file. writeHeader(out, donors); + final MetacellRowWriter rowWriter = new MetacellRowWriter(out, 1 + donors.size()); + // parse through the DGE and write out the meta-cell file. int counter=0; + String prevGene = null; while (parser.hasNext()) { String [] line = parser.next(); String gene = line[0]; + if (GENE_SORT == GeneSort.REQUIRE_SORTED && prevGene != null && gene.compareTo(prevGene) <= 0) { + throw new IllegalArgumentException("Genes are not sorted in input file. Gene '" + gene + "' is out of order."); + } + prevGene = gene; Map umis = getUMIsPerDonor (header, line, donorMap, this.STRATEGY); metrics = addUMIsPerDonor(metrics, umis); List lineParsed = buildBodyLine(gene, umis, donors, this.INTEGER_FORMAT); if (lineParsed!=null) - writeLine(lineParsed, out, false); + rowWriter.writeRow(lineParsed); if (counter > 0 && counter%PROGRESS_INTERVAL==0) log.info("Processed [" + counter +"] lines"); counter++; } log.info("Processed [" + counter +"] lines in total"); // cleanup. - parser.close(); - out.close(); + CloserUtil.close(parser); + rowWriter.close(); // write metrics output if requested if (this.METRICS!=null) metrics.writeMetrics(this.METRICS, this.INTEGER_FORMAT); @@ -265,7 +263,7 @@ List getCellBarcodesInClusters (String prefix, Set clusterLabels private Map filterDonorMapByClusterLabels (String prefix, Collection clusterLabels, final File clusterAssignmentFile, final Map donorMap) { // no op if prefix is unset or cluster assignment is null. if (prefix==null || clusterAssignmentFile==null) return (donorMap); - if (clusterLabels==null || clusterLabels.size()==0) return (donorMap); + if (clusterLabels==null || clusterLabels.isEmpty()) return (donorMap); Set labels=new HashSet<>(clusterLabels); Set cb = new HashSet<> (getCellBarcodesInClusters(prefix, labels, clusterAssignmentFile)); @@ -343,12 +341,8 @@ private Map getUMIsPerDonor (String [] header, String [] line, f double count = Double.parseDouble(line[i]); String donor = donorMap.get(cell); if (donor!=null) { - List vals = umiListPerDonor.get(donor); - if (vals==null) { - vals= new ArrayList<>(); - umiListPerDonor.put(donor, vals); - } - vals.add(count); + List vals = umiListPerDonor.computeIfAbsent(donor, k -> new ArrayList<>()); + vals.add(count); } } @@ -407,7 +401,7 @@ private Set getElementsToRetain(final List elements, final File Set toRetain = getRetainRemoveSet(retain); // in the case where the input retain file is not null, but has 0 entries, there is nothing to retain. - if (retain!=null & toRetain.size()==0) + if (retain!=null & toRetain.isEmpty()) return Collections.emptySet(); RetainRemoveList rrl = new RetainRemoveList<>(); @@ -427,7 +421,7 @@ private void writeHeader (final PrintStream out, final List donors) { List lineOut = new ArrayList<>(); // short-circuit: only write the gene header if there is at least one donor. - if (donors.size()==0) return; + if (donors.isEmpty()) return; lineOut.add(this.GENE_HEADER); lineOut.addAll(donors); @@ -435,10 +429,8 @@ private void writeHeader (final PrintStream out, final List donors) { out.println(b); } - private void writeLine (final List line, final PrintStream out, final boolean addGeneHeader) { + private void writeLine (final List line, final PrintStream out) { List lineOut = new ArrayList<>(line); - if (addGeneHeader) - lineOut.add(0, this.GENE_HEADER); String b = StringUtils.join(lineOut, "\t"); out.println(b); } @@ -473,5 +465,122 @@ public static void main(final String[] args) { System.exit(new CreateMetaCells().instanceMain(args)); } + public enum GeneSort implements CommandLineParser.ClpEnum { + ALLOW_UNSORTED("Emit genes in the order they appear in the input file."), + REQUIRE_SORTED("Fail if the input file is not sorted by gene name."), + SORT("Sort genes by name before writing (uses SortingCollection)."); + + private final String description; + + GeneSort(final String description) { + this.description = description; + } + + public String getHelpDoc() { + return this.description; + } + } + + private static class StringListCodec implements SortingCollection.Codec> { + final int numElements; + private ObjectOutputStream outputStream = null; + private ObjectInputStream inputReader = null; + + public StringListCodec(int numElements) { + this.numElements = numElements; + } + + @Override + public void setOutputStream(OutputStream outputStream) { + try { + this.outputStream = new ObjectOutputStream(outputStream); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + @Override + public void setInputStream(InputStream inputStream) { + try { + this.inputReader = new ObjectInputStream(inputStream); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + @Override + public void encode(List strings) { + if (strings.size() != this.numElements) { + throw new IllegalArgumentException("Expected " + this.numElements + " elements, but got " + strings.size()); + } + strings.stream().forEach(s -> { + try { + this.outputStream.writeObject(s); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }); + } + + @Override + public List decode() { + List ret = new ArrayList<>(this.numElements); + for (int i = 0; i < this.numElements; ++i) { + try { + ret.add((String) this.inputReader.readObject()); + } catch (EOFException e) { + return null; + } catch (IOException | ClassNotFoundException e) { + throw new RuntimeIOException(e); + } + } + return ret; + } + + @Override + public SortingCollection.Codec> clone() { + return new StringListCodec(numElements); + } + } + + private static class StringListComparator implements Comparator> { + @Override + public int compare(List o1, List o2) { + return o1.getFirst().compareTo(o2.getFirst()); + } + } + + private class MetacellRowWriter{ + private final PrintStream out; + private final SortingCollection> sortingCollection; + public MetacellRowWriter(PrintStream out, int numElements) { + this.out = out; + if (GENE_SORT == GeneSort.SORT) { + final Class c = List.class; + this.sortingCollection = SortingCollection.newInstance( + c, + new StringListCodec(numElements), + new StringListComparator(), + MAX_RECORDS_IN_RAM); + } else { + this.sortingCollection = null; + } + } + + public void writeRow(List row) { + if (GENE_SORT == GeneSort.SORT) { + this.sortingCollection.add(row); + } else { + writeLine(row, out); + } + } + public void close() { + if (GENE_SORT == GeneSort.SORT) { + this.sortingCollection.doneAdding(); + this.sortingCollection.iterator().forEachRemaining(l -> writeLine(l, out)); + } + out.close(); + } + } } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCellsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCellsTest.java index 3b5e9b0e..35438437 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCellsTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/eqtl/CreateMetaCellsTest.java @@ -2,6 +2,7 @@ import org.broadinstitute.dropseqrna.utils.TestUtils; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -20,6 +21,7 @@ public class CreateMetaCellsTest { private final File EXPECTED_SINGLE_METACELL = new File(TEST_DATA_DIR, "single_metacell.txt.gz"); private final File DGE = new File("testdata/org/broadinstitute/transcriptome/barnyard/digitalexpression/test_with_header3.unpaired.dge.txt.gz"); private final File EXPECTED_METACELLS_BY_CLUSTER= new File (TEST_DATA_DIR, "cluster_1_2.meta_cells.txt"); + private final File EXPECTED_METACELLS_BY_CLUSTER_SORTED= new File (TEST_DATA_DIR, "cluster_1_2.meta_cells.sorted.txt"); private final File MERGED_DGE_HEADER_FILE=new File(TEST_DATA_DIR, "test_with_header3.unpaired.dge.dge_header.txt"); private final File CLUSTER_ASSIGNMENT_FILE=new File(TEST_DATA_DIR, "test_with_header3.unpaired.dge.assign.txt"); @@ -27,6 +29,13 @@ public class CreateMetaCellsTest { @Test () public void testMetacellsWithICAClusters() throws IOException { + final CreateMetaCells clp = makeBasicTestClp(); + Assert.assertEquals(clp.doWork(), 0); + Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER, clp.OUTPUT)); + + } + + private CreateMetaCells makeBasicTestClp() throws IOException { final CreateMetaCells clp = new CreateMetaCells(); clp.INPUT = DGE; clp.DONOR_MAP = DONOR_MAP; @@ -34,13 +43,37 @@ public void testMetacellsWithICAClusters() throws IOException { clp.CLUSTER_ASSIGNMENT=Arrays.asList("Cluster_1", "Cluster_2"); clp.MERGED_DGE_HEADER_FILE=MERGED_DGE_HEADER_FILE; clp.OUTPUT = File.createTempFile("testMetacellsWithICAClusters.",".metacells.txt.gz"); - clp.OUTPUT.deleteOnExit(); + clp.OUTPUT.deleteOnExit(); + return clp; + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testInputSortRequirement() throws IOException { + final CreateMetaCells clp = makeBasicTestClp(); + clp.GENE_SORT = CreateMetaCells.GeneSort.REQUIRE_SORTED; + clp.doWork(); + + } + @Test(dataProvider = "testMetacellsWithGeneSortDataProvider") + public void testMetacellsWithGeneSort(Integer maxRecordsInRam) throws IOException { + final CreateMetaCells clp = makeBasicTestClp(); + clp.GENE_SORT = CreateMetaCells.GeneSort.SORT; + if (maxRecordsInRam != null) { + clp.MAX_RECORDS_IN_RAM = maxRecordsInRam; + } Assert.assertEquals(clp.doWork(), 0); - Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER, clp.OUTPUT)); - + Assert.assertTrue(TestUtils.testFilesSame(EXPECTED_METACELLS_BY_CLUSTER_SORTED, clp.OUTPUT)); } - - + + // One invocation that is small enough that SortingCollection spills to disk. + @DataProvider(name = "testMetacellsWithGeneSortDataProvider") + public Object[][] testMetacellsWithGeneSortDataProvider() { + return new Object[][]{ + {null}, + {3} + }; + } + @Test public void testGetCellBarcodesInClusters() { final CreateMetaCells clp = new CreateMetaCells(); diff --git a/testdata/org/broadinstitute/dropseq/eqtl/cluster_1_2.meta_cells.sorted.txt b/testdata/org/broadinstitute/dropseq/eqtl/cluster_1_2.meta_cells.sorted.txt new file mode 100644 index 00000000..b944a0f1 --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/eqtl/cluster_1_2.meta_cells.sorted.txt @@ -0,0 +1,39 @@ +GENE 1 2 3 +ACADM 20 22 23 +ACTA1 0 1 0 +AGO3 13 13 17 +AGO4 3 1 9 +AHCTF1 13 15 6 +AHCYL1 12 14 15 +AKR1A1 41 31 34 +AMIGO1 1 1 3 +AMY2B 1 2 0 +ANGEL2 6 3 8 +ANKRD13C 19 20 10 +APH1A 8 8 7 +ARF1 57 74 64 +ARHGEF2 1 2 2 +ARPC5 146 180 151 +ARV1 15 8 9 +ASCL5 2 0 0 +ASH1L 31 37 25 +ASH1L-AS1 2 0 4 +ATAD3C 0 1 0 +ATP13A2 22 30 20 +B4GALT3 7 13 10 +BCAN 0 0 1 +BCAS2 32 25 22 +BGLAP 0 1 0 +BTBD19 1 0 0 +C1orf101 0 0 1 +C1orf112 1 0 1 +C1orf122 72 73 66 +C1orf220 0 0 1 +C1orf228 0 0 1 +C1orf35 10 8 5 +C1orf56 7 2 4 +C1orf95 2 0 1 +CACHD1 1 3 2 +CAPN2 9 13 9 +CC2D1B 1 0 0 +CCNL2 23 23 15