Jn compare bam tag values (#487)

* Working on CompareBAMTagValues * Working on CompareBAMTagValues * Working on CompareBAMTagValues This now passes existing unit tests that use it. It needs its own set of unit tests to validate that the outputs look good, but the basic processing/comparisons of two BAM files looks solid. * CompareBAMTagValues now passes all unit tests, ready for testing on larger data sets. * CompareBAMTagValues now passes all unit tests, ready for testing on larger data sets. * Added the ability to write reads private to one data set to a BAM output file. * Added DNACompressor unit tests. * Updated DNACompressor to handle N bases. * Updated DNACompressor to handle N bases. * Updated DNACompressor to handle N bases. * Fixed CompareBAMTagValues bug where a paired read BAM compared to a single read bam could flip the tag values in the output file so the tags were associated with the wrong data set. * Lots of changes to maintain code coverage. Thanks chatGPT. * Lots of changes to maintain code coverage. Thanks chatGPT.
broadinstitute · Jan 15, 2025 · bf3086b · bf3086b
1 parent 32cf96a
commit bf3086b
Show file tree

Hide file tree

Showing 25 changed files with 2,173 additions and 185 deletions.
diff --git a/src/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapper.java b/src/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapper.java
@@ -0,0 +1,34 @@
+package org.broadinstitute.dropseqrna.utils;
+
+import java.util.Arrays;
+
+public class ByteArrayWrapper implements Comparable<ByteArrayWrapper> {
+    private final byte[] data;
+
+    public ByteArrayWrapper(byte[] data) {
+        this.data = data;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) return true;
+        if (obj == null || getClass() != obj.getClass()) return false;
+        ByteArrayWrapper that = (ByteArrayWrapper) obj;
+        return Arrays.equals(this.data, that.data);
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(data);
+    }
+
+    @Override
+    public int compareTo(ByteArrayWrapper other) {
+        return Arrays.compare(this.data, other.data); // Lexicographical comparison
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+}
diff --git a/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java b/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java
diff --git a/src/java/org/broadinstitute/dropseqrna/utils/DNACompressor.java b/src/java/org/broadinstitute/dropseqrna/utils/DNACompressor.java
@@ -0,0 +1,222 @@
+package org.broadinstitute.dropseqrna.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
+/**
+ * This compresses and decompresses DNA sequences into a byte array.
+ * Each sequence is compressed into a byte array, where each base is represented by 2 bits.
+ * The first byte of the compressed data contains the remainder of the sequence length divided by 4.
+ * The remaining bytes contain the bases packed into 2 bits each.
+ * The decompressed sequence is reconstructed by extracting the bases from the packed bytes.
+
+ * The key difference between this version and DNACompressorVaryingLengths is that this version
+ * does not store the lengths of the original sequences in the compressed data, which can save space if the lengths are short.
+ *
+ */
+public class DNACompressor {
+
+    private static final int BITS_PER_BASE = 3;
+    private static final int BASES_PER_BYTE = 8 / BITS_PER_BASE;
+
+    /**
+     * Compresses a list of DNA strings into a single byte array, including a leading bitmask for null values.
+     *
+     * @param dnaList A list of DNA strings to compress. Each string must contain only 'A', 'C', 'G', 'T' or be null.
+     * @return A byte array representing the compressed DNA sequences with a leading bitmask.
+     */
+    public static byte[] compressList(List<String> dnaList) {
+        int numEntries = dnaList.size();
+        int bitmaskSize = (numEntries + 7) / 8; // 1 bit per entry, rounded up to the nearest byte
+        byte[] bitmask = new byte[bitmaskSize];
+
+        List<byte[]> compressedSequences = new ArrayList<>();
+        int totalCompressedSize = bitmaskSize; // Include the bitmask size
+
+        // Create the bitmask and compress non-null sequences
+        for (int i = 0; i < numEntries; i++) {
+            if (dnaList.get(i) == null) {
+                // Mark null entries in the bitmask
+                bitmask[i / 8] |= (byte) (1 << (i % 8));
+            } else {
+                // Compress non-null sequences
+                byte[] compressed = compress(dnaList.get(i));
+                compressedSequences.add(compressed);
+                totalCompressedSize += compressed.length;
+            }
+        }
+
+        // Pack the bitmask and compressed data into a single byte array
+        byte[] result = new byte[totalCompressedSize];
+        System.arraycopy(bitmask, 0, result, 0, bitmaskSize);
+
+        int offset = bitmaskSize;
+        for (byte[] compressed : compressedSequences) {
+            System.arraycopy(compressed, 0, result, offset, compressed.length);
+            offset += compressed.length;
+        }
+
+        return result;
+    }
+
+
+    /**
+     * Decompresses a single byte array into a list of DNA strings, including handling null values using the bitmask.
+     *
+     * @param compressed The byte array containing the bitmask and compressed DNA sequences.
+     * @param lengths    An array of lengths of the original DNA sequences.
+     * @return A list of decompressed DNA strings, including null values.
+     */
+    /**
+     * Decompresses a single byte array into a list of DNA strings, including handling null values using the bitmask.
+     *
+     * @param compressed The byte array containing the bitmask and compressed DNA sequences.
+     * @param lengths    An array of lengths of the original DNA sequences.
+     * @return A list of decompressed DNA strings, including null values.
+     */
+    public static List<String> decompressList(byte[] compressed, int[] lengths) {
+        int numEntries = lengths.length;
+        int bitmaskSize = (numEntries + 7) / 8;
+
+        // Extract the bitmask
+        byte[] bitmask = new byte[bitmaskSize];
+        System.arraycopy(compressed, 0, bitmask, 0, bitmaskSize);
+
+        List<String> decompressed = new ArrayList<>();
+        int offset = bitmaskSize;
+
+        // Decompress each sequence based on the bitmask
+        for (int i = 0; i < numEntries; i++) {
+            if ((bitmask[i / 8] & (1 << (i % 8))) != 0) {
+                // Null value indicated by the bitmask
+                decompressed.add(null);
+            } else {
+                // Calculate byte length and remainder for the sequence
+                int length = lengths[i];
+                int[] calc = calculateByteLengthAndRemainder(length);
+                int byteLength = calc[0];
+
+                // Extract the compressed data for this sequence
+                byte[] sequenceData = new byte[1 + byteLength];
+                System.arraycopy(compressed, offset, sequenceData, 0, 1 + byteLength);
+
+                // Decompress and add to the list
+                decompressed.add(decompress(sequenceData));
+                offset += (1 + byteLength);
+            }
+        }
+        return decompressed;
+    }
+
+    /**
+     * Compresses a single DNA string into a byte array, supporting 'A', 'C', 'G', 'T', and 'N'.
+     */
+    public static byte[] compress(String dna) {
+        int[] calc = calculateByteLengthAndRemainder(dna.length());
+        int byteLength = calc[0];
+        int remainder = calc[1];
+
+        byte[] compressed = new byte[1 + byteLength];
+        compressed[0] = (byte) remainder;
+
+        for (int i = 0; i < dna.length(); i++) {
+            int base = switch (dna.charAt(i)) {
+                case 'A' -> 0b000;
+                case 'C' -> 0b001;
+                case 'G' -> 0b010;
+                case 'T' -> 0b011;
+                case 'N' -> 0b100;
+                default -> throw new IllegalArgumentException("Invalid base: " + dna.charAt(i));
+            };
+
+            int byteIndex = 1 + (i / BASES_PER_BYTE);
+            int bitPosition = (BASES_PER_BYTE - 1 - (i % BASES_PER_BYTE)) * BITS_PER_BASE;
+            compressed[byteIndex] |= (byte) (base << bitPosition);
+        }
+
+        return compressed;
+    }
+
+    /**
+     * Decompresses a single compressed DNA sequence back into a string, supporting 'A', 'C', 'G', 'T', and 'N'.
+     */
+    public static String decompress(byte[] compressed) {
+        int remainder = compressed[0] & 0xFF;
+        int totalBases = (compressed.length - 1) * BASES_PER_BYTE - ((BASES_PER_BYTE - remainder) % BASES_PER_BYTE);
+
+        StringBuilder dna = new StringBuilder(totalBases);
+
+        for (int i = 0; i < totalBases; i++) {
+            int byteIndex = 1 + (i / BASES_PER_BYTE);
+            int bitPosition = (BASES_PER_BYTE - 1 - (i % BASES_PER_BYTE)) * BITS_PER_BASE;
+            int base = (compressed[byteIndex] >> bitPosition) & 0b111;
+
+            char nucleotide = switch (base) {
+                case 0b000 -> 'A';
+                case 0b001 -> 'C';
+                case 0b010 -> 'G';
+                case 0b011 -> 'T';
+                case 0b100 -> 'N';
+                default -> throw new IllegalStateException("Invalid base bits: " + base);
+            };
+            dna.append(nucleotide);
+        }
+
+        return dna.toString();
+    }
+
+    /**
+     * Calculates the byte length and remainder for a given DNA sequence length.
+     *
+     * @param sequenceLength The length of the DNA sequence.
+     * @return An array where [0] is the byte length and [1] is the remainder.
+     */
+    private static int[] calculateByteLengthAndRemainder(int sequenceLength) {
+        int remainder = sequenceLength % BASES_PER_BYTE;
+        int byteLength = (sequenceLength + BASES_PER_BYTE - 1) / BASES_PER_BYTE;
+        return new int[]{byteLength, remainder};
+    }
+
+
+    /**
+     * Compares two compressed DNA outputs (byte arrays) lexicographically,
+     * skipping the bitmask portion of the array.
+     *
+     * @param lengths The lengths of the entries in the compressed data.
+     * @return Comparator for comparing compressed DNA outputs.
+     */
+    public static Comparator<byte[]> getDecompressedComparator(int[] lengths) {
+        return (a, b) -> {
+            // Decompress both byte arrays into lists of strings
+            List<String> listA = decompressList(a, lengths);
+            List<String> listB = decompressList(b, lengths);
+
+            // Compare the decompressed lists lexicographically
+            int len = Math.min(listA.size(), listB.size());
+            for (int i = 0; i < len; i++) {
+                String valueA = listA.get(i);
+                String valueB = listB.get(i);
+
+                // Null values are treated as smaller than non-null values
+                if (valueA == null && valueB == null) {
+                    continue; // Equal
+                } else if (valueA == null) {
+                    return -1; // Null < Non-null
+                } else if (valueB == null) {
+                    return 1; // Non-null > Null
+                }
+
+                // Compare non-null strings
+                int cmp = valueA.compareTo(valueB);
+                if (cmp != 0) {
+                    return cmp; // Return as soon as a difference is found
+                }
+            }
+
+            // If all compared strings are equal, the shorter list is smaller
+            return Integer.compare(listA.size(), listB.size());
+        };
+    }
+}