diff --git a/src/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapper.java b/src/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapper.java
new file mode 100644
index 00000000..601d12b5
--- /dev/null
+++ b/src/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapper.java
@@ -0,0 +1,34 @@
+package org.broadinstitute.dropseqrna.utils;
+
+import java.util.Arrays;
+
+public class ByteArrayWrapper implements Comparable<ByteArrayWrapper> {
+    private final byte[] data;
+
+    public ByteArrayWrapper(byte[] data) {
+        this.data = data;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) return true;
+        if (obj == null || getClass() != obj.getClass()) return false;
+        ByteArrayWrapper that = (ByteArrayWrapper) obj;
+        return Arrays.equals(this.data, that.data);
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(data);
+    }
+
+    @Override
+    public int compareTo(ByteArrayWrapper other) {
+        return Arrays.compare(this.data, other.data); // Lexicographical comparison
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+}
diff --git a/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java b/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java
index 1814244a..fd36d160 100644
--- a/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java
+++ b/src/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValues.java
@@ -25,22 +25,31 @@ package org.broadinstitute.dropseqrna.utils;
 import java.io.File;
-import java.util.Iterator;
-import java.util.List;
+import java.io.PrintStream;
+import java.util.*;
+import java.util.regex.Pattern;
+
+import htsjdk.samtools.*;
+import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.util.*;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.dropseqrna.cmdline.CustomCommandLineValidationHelper;
 import org.broadinstitute.dropseqrna.cmdline.DropSeq;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SamReaderFactory;
-import htsjdk.samtools.util.IOUtil;
-import htsjdk.samtools.util.Log;
+import org.broadinstitute.dropseqrna.utils.alignmentcomparison.QueryNameJointIterator;
+import org.broadinstitute.dropseqrna.utils.io.ErrorCheckingPrintStream;
+import org.broadinstitute.dropseqrna.utils.readiterators.*;
 import picard.cmdline.CommandLineProgram;
+import picard.nio.PicardHtsPath;
+
+import java.util.stream.IntStream;
+import java.util.stream.Collectors;
 @CommandLineProgramProperties(summary = "Tests that two BAMs have the same TAG values per read. "
-        + "We assume the two BAMs tested have the same set of reads in the same order, and test the read names "
-        + "for each iteration match, and the values for the TAGs listed are equal.",
+        + "This program only considers primary reads, and can optionally filter to reads with a minimum mapping quality. "
+        + "Reads are matched by query name string and first or second of the pair (see Picard's SAMRecordQueryNameComparator). "
+        + "Tags on reads are then compared for equality. 
If tag values differ or tag values are missing, the read is considered discordant.", oneLineSummary = "Tests that two BAMs have the same TAG values per read.", programGroup = DropSeq.class) @@ -49,66 +58,728 @@ public class CompareBAMTagValues extends CommandLineProgram { private final Log log = Log.getInstance(CompareBAMTagValues.class); @Argument(doc = "The input SAM or BAM file to analyze.", optional=false) - public File INPUT_1; + public List INPUT_1; @Argument(doc = "The input SAM or BAM file to analyze.", optional=false) - public File INPUT_2; + public List INPUT_2; + + @Argument(doc = "Output a version of INPUT_1 containing only the reads that are discordant with INPUT_2", optional = true) + public File BAM_OUTPUT_1; + + @Argument(doc = "Output a version of INPUT_2 containing only the reads that are discordant with INPUT_1", optional = true) + public File BAM_OUTPUT_2; + + @Argument(doc = "Output a version of INPUT_1 containing only the reads that are only observed in INPUT_1", optional = true) + public File BAM_UNIQUE_1; + + @Argument(doc = "Output a version of INPUT_2 containing only the reads that are only observed in INPUT_2", optional = true) + public File BAM_UNIQUE_2; + + + @Argument(doc="A list of tags to test in INPUT_1. These tags are paired with TAGS_2 at the same index. For each index, the values of TAGS_1[i] in INPUT_1 are" + + "compared to the values of TAGS_2[i] in INPUT_2 for equality.", optional=false) + public List TAGS_1; + + @Argument(doc="A list of tags to test in INPUT_2. Values of these tags will be compared across BAMs for equality.", optional=false) + public List TAGS_2; + + @Argument(doc="Some programs (STARsolo) set missing tag values to '-'. Tags with this value are treated as unset for comparisons. Set to null to disable this filter.", optional=true) + public String TAG_MISSING_VALUE = "-"; + + @Argument(doc="Some programs (CellRanger) have suffixes on tags like the cell barcode. This option will trim that suffix from all tag values", optional=true) + public String REMOVE_TAG_SUFFIX="-1"; + + @Argument(doc="Output file of the tag values that differ between the two BAMs. The program records each unique tuple of tag values with the number of reads that contain those values." + + "The column names contain the input identifier I1 or I2, followed by the TAGS_1 and TAGS_2 names for each index.", optional=true) + public File TAG_VALUES_OUTPUT; + + @Argument(doc="If true, only discordant read tag values are tracked and emitted in TAG_VALUES_OUTPUT. For tags with many possible values, this option can be more memory efficient. " + + "If false, all read tag values are emitted.", optional=true) + public Boolean DISCORDANT_READS_ONLY=false; - @Argument(doc="A list of tags to test. Values of these tags will be compared across BAMs for equality.", optional=false) - public List TAGS; + @Argument(doc="A summary of the number of reads that were observed in each BAM. Records the number of reads only in INPUT_1, only in INPUT_2, and in both INPUT_1 and INPUT_2.", optional=true) + public File READ_COUNT_OUTPUT; - @Override + @Argument(doc="If true, the program will exit with an error if any read is encountered that is not in both INPUTS or has TAG values that disagree") + public Boolean STRICT=true; + + @Argument(doc="Restrict analysis to reads with a mapping quality greater than or equal to this value. Reads with a mapping quality less than this value are ignored. 
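/*
 * Hypothetical programmatic use of this tool (mirrors the helper added to
 * DetectBeadSynthesisErrorsTest at the end of this diff); file names and the "XC" tag
 * are placeholders:
 *
 *   CompareBAMTagValues cbtv = new CompareBAMTagValues();
 *   cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath("expected.bam"));
 *   cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath("actual.bam"));
 *   cbtv.TAGS_1 = Collections.singletonList("XC");
 *   cbtv.TAGS_2 = Collections.singletonList("XC");
 *   cbtv.STRICT = true;
 *   int result = cbtv.doWork();   // 0 when all shared reads have concordant tag values
 */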
Non-primary reads are always ignored.", optional=true) + public int READ_QUALITY=0; + + /* + @Argument(doc="If all tag values consist of the bases A,C,G,T (for example cell or molecular barcodes) then compress the internal representations to byte arrays to reduce memory usage." + + "If your strings as of fixed length, use useFixedLengthBaseCompression instead, as it is even more memory efficient. If values are encountered that can not be compressed the program will fail.", optional=true) + */ + private Boolean useBaseCompression=false; + + @Argument(doc="If all tag values consist of the bases A,C,G,T (for example cell or molecular barcodes) then compress the internal representations to byte arrays to reduce memory usage. These strings must be of fixed length." + + "If values are encountered that can not be compressed the program will fail.", optional=true) + public Boolean useFixedLengthBaseCompression=false; + + // a delimiter to use for the tag values if persisting as strings. + private String TAG_DELIMITER=":"; + + // Determine the mode + private CompressionMode mode; + + // track the tag lengths for the fixed length compression mode. + private int [] tagLengths; + + /** + * There are two ways to encode tag values for counting. + * 1: Keep the tag values as strings, use ":" as a delimiter between values as BAM tag values do not allow that string to be used. + * 2: Encode the tag values as byte arrays, where the byte array is the concatenation of the bytes of the tag values. Values are encoded via Base64 encoding. + */ + ObjectCounter tagValuesString = new ObjectCounter<>(); + ObjectCounter tagValuesByteArray = new ObjectCounter<>(); + + @Override public int doWork() { - IOUtil.assertFileIsReadable(INPUT_1); - IOUtil.assertFileIsReadable(INPUT_2); - - Iterator iterator1 = SamReaderFactory.makeDefault().open(INPUT_1).iterator(); - Iterator iterator2 = SamReaderFactory.makeDefault().open(INPUT_2).iterator(); - - while (iterator1.hasNext() && iterator2.hasNext()) { - SAMRecord r1 = iterator1.next(); - SAMRecord r2 = iterator2.next(); - // log.info(r1.toString() + " tags read 1: " + getTagsAsString(r1, this.TAGS)); - boolean test = compareTags(r1, r2, this.TAGS) ; - if (!test) { - log.info("Difference found for read " + r1.toString()+ " read 1: " + getTagsAsString(r1, this.TAGS)+ " read 2: "+ getTagsAsString(r2, this.TAGS)); + // Expand the input file lists. + this.INPUT_1 = FileListParsingUtils.expandPicardHtsPathList(INPUT_1); + this.INPUT_2 = FileListParsingUtils.expandPicardHtsPathList(INPUT_2); + + // Determine the compression mode for output tags report. + mode = CompressionMode.fromFlags(this.useBaseCompression, this.useFixedLengthBaseCompression); + if (TAG_VALUES_OUTPUT!=null) + log.info("Using tag value compression mode: " + mode); + + // Keep the headers to create output BAM files. + final SamHeaderAndIterator headerAndIter_1 = SamFileMergeUtil.mergeInputPaths( + PicardHtsPath.toPaths(this.INPUT_1), false, SamReaderFactory.makeDefault()); + + final SamHeaderAndIterator headerAndIter_2 = SamFileMergeUtil.mergeInputPaths( + PicardHtsPath.toPaths(this.INPUT_2), false, SamReaderFactory.makeDefault()); + + // these values may be null if the output is not requested. 
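/*
 * Illustrative sketch (assumed values) of the two tag-tuple encodings counted by the
 * tagValuesString / tagValuesByteArray counters declared above; DNACompressor is added
 * later in this diff:
 *
 *   List<String> values = Arrays.asList("ACGTACGTAC", "ACGTACGTAC");  // interleaved INPUT_1/INPUT_2 values
 *
 *   // Mode 1: plain strings joined on the ':' delimiter
 *   tagValuesString.increment(String.join(":", values));              // key "ACGTACGTAC:ACGTACGTAC"
 *
 *   // Mode 2: fixed-length base compression, keyed on the packed bytes
 *   tagValuesByteArray.increment(new ByteArrayWrapper(DNACompressor.compressList(values)));
 *   // DNACompressor.decompressList(bytes, new int[]{10, 10}) recovers the values for the report
 */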
+ SAMFileWriter writer_1 = createWriter(headerAndIter_1.header, this.BAM_OUTPUT_1); + SAMFileWriter writer_2 = createWriter(headerAndIter_2.header, this.BAM_OUTPUT_2); + + log.info("Constructing BAM reader for input " + this.INPUT_1.getFirst().toString()); + PeekableIterator> iterator1 = getReadIterator (headerAndIter_1,this.READ_QUALITY, this.TAGS_1); + + log.info("Constructing BAM reader for input " + this.INPUT_2.getFirst().toString()); + PeekableIterator> iterator2 = getReadIterator (headerAndIter_2,this.READ_QUALITY, this.TAGS_2); + + QueryNameJointIterator jointIterator = new QueryNameJointIterator(iterator1, iterator2); + addReadSinks(jointIterator, headerAndIter_1.header, headerAndIter_2.header); + + while (jointIterator.hasNext()) { + QueryNameJointIterator.JointResult jr = jointIterator.next(); + // it's possible one data set is paired and, and the other is not. + // if one read is paired and the other is not, then if either read of the pair is concordant, the pair is concordant. + // if both reads are paired then compare both reads and mark as discordant if either read has discordant tags. + List rList1 = jr.getOne(); + List rList2 = jr.getTwo(); + String readName = rList1.getFirst().getReadName(); + + boolean tagsConcordant = compareTagsReadGroup(rList1, rList2, this.TAGS_1, this.TAGS_2); + if (this.STRICT && !tagsConcordant) { + log.error("Tag values differ for read: " + readName); return 1; } + writeDiscordantReads(writer_1, rList1, writer_2, rList2, tagsConcordant); } + + QueryNameJointIterator.QueryNameJointIteratorMetrics metrics= jointIterator.getMetrics(); + if (this.STRICT && metrics.hasDisjointReads()) { + log.error("Input BAMs have different reads. Exiting with error."); + return 1; + } + + if (this.READ_COUNT_OUTPUT!=null) { + MetricsFile out = getMetricsFile(); + out.addMetric(jointIterator.getMetrics()); + out.write(READ_COUNT_OUTPUT); + } + + writeTagValuesReport(TAG_VALUES_OUTPUT); + + // close the writers. + if (writer_1!=null) + writer_1.close(); + if (writer_2!=null) + writer_2.close(); + log.info("DONE"); return 0; } - - private String getTagsAsString (SAMRecord r, List tags) { - StringBuilder b = new StringBuilder(); - for (String t: tags) { - b.append(t + "[" + r.getStringAttribute(t) +"] "); + + /** + * Convenience method to add read sinks to the joint iterator for reads that are only seen in one of the two input BAMs. + * + * @param jointIterator The joint iterator to add read sinks to. + * @param h1 The header of the first input BAM. + * @param h2 The header of the second input BAM. + */ + private void addReadSinks(QueryNameJointIterator jointIterator, SAMFileHeader h1, SAMFileHeader h2) { + SamWriterSink sink1=null; + SamWriterSink sink2=null; + + if (this.BAM_UNIQUE_1!=null) { + SAMFileWriter writer = createWriter(h1, this.BAM_UNIQUE_1); + sink1 = new SamWriterSink(writer); + } + if (this.BAM_UNIQUE_2!=null) { + SAMFileWriter writer = createWriter(h2, this.BAM_UNIQUE_2); + sink2 = new SamWriterSink(writer); } - return b.toString(); + jointIterator.addReadSinks(sink1, sink2); } - boolean compareTags (final SAMRecord r1, final SAMRecord r2, final List tags) { - boolean sameRead = r1.getReadName().equals(r2.getReadName()); - if (!sameRead) - log.error("Read names differ at iteration. 
R1: "+ r1.toString(), "R2: ", r2.toString()); + private void writeTagValuesReport (final File output) { + if (this.TAG_VALUES_OUTPUT==null) + return; - for (String tag: tags) { - Object o1 = r1.getAttribute(tag); - Object o2 = r2.getAttribute(tag); - if (o1==null && o2==null) return true; - if ((o1==null && o2!=null ) || (o1!=null && o2==null)) { - log.error("Read tag values differ for tag: ["+ tag.toString()+ "] R1 is null [" + String.valueOf(o1==null) + "] R2 is null [" + String.valueOf(o2==null)+"]"); - return false; + PrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting(output)); + writeTagValuesHeader(out); + + switch (mode) { + case NO_COMPRESSION -> { + // Sort keys alphanumerically + List sortedKeys = new ArrayList<>(tagValuesString.getKeys()); + sortedKeys.sort(CustomKeyComparator.INSTANCE); // Apply custom sorting by components + // Replace "null" with "NA" after sorting + // sortedKeys.replaceAll(key -> key == null ? "NA" : key.replaceAll("\\bnull\\b", "NA")); + + for (String key : sortedKeys) { + // Get the report line and apply the null -> "NA" transformation + List reportLine = replaceNullsWithNA(getTagValuesReportLine(key)); + out.println(String.join("\t", reportLine)); + } } + case FIXED_LENGTH_COMPRESSION, VARYING_LENGTH_COMPRESSION -> { + // Sort keys by compressed output + List sortedKeys = new ArrayList<>(tagValuesByteArray.getKeys()); + sortedKeys.sort(Comparator.comparing( + ByteArrayWrapper::getData, // Extract raw byte arrays from ByteArrayWrapper + DNACompressor.getDecompressedComparator(this.tagLengths) // Use comparator from DNACompressor + )); + for (ByteArrayWrapper key : sortedKeys) { + // Get the report line and apply the null -> "NA" transformation + List reportLine = replaceNullsWithNA(getTagValuesReportLine(key)); + out.println(String.join("\t", reportLine)); + } + } + } - if (!o1.equals(o2)) { - log.error("Read tag values differ for tag: ["+ tag.toString()+ "] " + r1.toString()+ "[" +o1.toString()+ "] "+ r2.toString() +" [" + o2.toString()+"]"); + } + + // Reusable method to replace null values with "NA" in a list + private static List replaceNullsWithNA(List list) { + if (list != null) { + list.replaceAll(value -> { + if (value == null) { + return "NA"; // Replace completely null values + } + // Replace "null" substrings in concatenated strings with "NA" + return value.replaceAll("\\bnull\\b", "NA"); + }); + } + return list; + } + + private void writeTagValuesHeader (PrintStream out) { + List header = new ArrayList<>(); + header.add("COUNT"); + for (int i=0; i getTagValuesReportLine (String key) { + List result = new ArrayList<>(); + int count = tagValuesString.getCountForKey(key); + String[] tagValues = key.split(this.TAG_DELIMITER); + result.add(Integer.toString(count)); + result.addAll(Arrays.asList(tagValues)); + return (result); + } + + private List getTagValuesReportLine(ByteArrayWrapper key) { + List tagValues; + switch (mode) { + case FIXED_LENGTH_COMPRESSION -> tagValues = DNACompressor.decompressList(key.getData(), this.tagLengths); + // case VARYING_LENGTH_COMPRESSION -> tagValues = DNACompressorVaryingLengths.decompressList(key.getData()); + default -> throw new IllegalStateException("Unexpected compression mode: " + mode); + } + List result = new ArrayList<>(); + int count = tagValuesByteArray.getCountForKey(key); + result.add(Integer.toString(count)); + result.addAll(tagValues); + return (result); + } + + private void writeDiscordantReads (final SAMFileWriter writer1, List r1, final SAMFileWriter writer2, List r2, 
boolean tagsConcordant) { + if (tagsConcordant) + return; + if (writer1!=null) + r1.forEach(writer1::addAlignment); + if (writer2!=null) + r2.forEach(writer2::addAlignment); + } + + private SAMFileWriter createWriter(SAMFileHeader header, File output) { + // if I don't clone the header, the side effect breaks the main program iterator. + SAMFileHeader outputHeader = header.clone(); + outputHeader.setSortOrder(SAMFileHeader.SortOrder.queryname); + return output != null ? new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, output) : null; + } + + /** + * Compares the tags of reads between two lists to determine if they are concordant. + * + * Logic: + * - If both lists have a single read, compare the tags directly using {@code compareTags}. + * - If one list has a single read, compare it against all reads in the other (multi-read) list: + * - Filter the multi-read list to remove reads missing the required tags. + * - If no matching reads are found, the first read from the filtered list is selected. + * - Determine if any read in the multi-read list is concordant with the single read. + * - If both lists have multiple reads: + * - Verify that both lists have the same size; if not, an exception is thrown. + * - Sort both lists by first/second of pair using {@code PAIRED_READ_ORDER_COMPARATOR}. + * - Compare the tags of reads pairwise. All reads must match for the result to be true. + * + * Behavior: + * - Returns {@code true} if all relevant reads are concordant, {@code false} otherwise. + * - Throws an {@link IllegalStateException} if both lists have multiple reads but different sizes. + * + * @param r1List The first list of reads. This can be a single-read or multi-read list. + * @param r2List The second list of reads. This can also be a single-read or multi-read list. + * @param tags1 The tags to compare in the reads from {@code r1List}. + * @param tags2 The tags to compare in the reads from {@code r2List}. + * @return {@code true} if the tags are concordant across the two lists; {@code false} otherwise. + */ + private boolean compareTagsReadGroup(final List r1List, final List r2List, final List tags1, final List tags2) { + // Handle cases with one list empty (optional, depending on your requirements) + if (r1List.isEmpty() || r2List.isEmpty()) { + return false; + } + + // Both lists have a single read + if (r1List.size() == 1 && r2List.size() == 1) { + return compareTags(r1List.getFirst(), r2List.getFirst(), tags1, tags2, true); + } + + // One list has a single read, compare against all reads in the other list + if (r1List.size() == 1 || r2List.size() == 1) { + if (r1List.size()>1) { + boolean result = processReadLists(r1List, r2List, tags1, tags2, true); + return result; + } else { + boolean result = processReadLists(r2List, r1List, tags2, tags1, false); + return result; + } + } + + // Both lists have multiple reads, sort and compare pairwise + if (r1List.size() != r2List.size()) { + throw new IllegalStateException("Both lists have multiple reads, but different sizes. 
This should never happen."); + // return false; // Different sizes cannot match pairwise + } + + r1List.sort(PAIRED_READ_ORDER_COMPARATOR); + r2List.sort(PAIRED_READ_ORDER_COMPARATOR); + + for (int i = 0; i < r1List.size(); i++) { + SAMRecord r1 = r1List.get(i); + SAMRecord r2 = r2List.get(i); + + // If any pair does not match, return false immediately + if (!compareTags(r1, r2, tags1, tags2, true)) { return false; } + } + + return true; // All pairs matched + } + + /** + * Processes a multi-read list and a single-read list to find concordant reads based on specific tags. + * + * This method filters the multi-read list based on the presence of all required tags (`tagsMultiread`), + * then attempts to find concordant reads between the filtered multi-read list and the single-read list. + * If no concordant reads are found, it selects the first record from the filtered multi-read list. + * + * The multiReadListFirst indicates if the multi-read list contains data from the first input file - this is critical + * so that the tag values can be saved in the correct order in the output report. + * + * @param multiReadList The list of reads containing multiple records. + * @param singleReadList The list of reads containing a single record. + * @param tagsMultiread A list of tags used for filtering and comparison in the multi-read list. + * @param tagsSingleRead A list of tags used for comparison between records in both lists. + * @param multiReadListFirst If true, the multiReadList contains data from the first input file. + * @return A boolean value indicating whether concordant reads were found and processed. + */ + public boolean processReadLists(List multiReadList, List singleReadList, + List tagsMultiread, List tagsSingleRead, boolean multiReadListFirst) { + + // Filter the multi-read list based on tagsMultiread + // This gets rid of reads that don't have the tags of interest. + List filteredMultiReadList = multiReadList.stream() + .filter(record -> tagsMultiread.stream().allMatch(tag -> record.getStringAttribute(tag) != null)) + .collect(Collectors.toList()); + + // If the filtered list is empty, add the first record from the original multi-read list + // if no reads have the tags of interest, then we're going to eventually record that. + if (filteredMultiReadList.isEmpty()) { + filteredMultiReadList.add(multiReadList.getFirst()); + } + + // Explicitly handle the case where both lists have exactly one read + // The data has been simplified back to the base case of a single read in each list that can be compared. + if (filteredMultiReadList.size() == 1 && singleReadList.size() == 1) { + // make sure the multi read and single read tags are in the right order to record tag values. + if (multiReadListFirst) { + return compareTags(filteredMultiReadList.getFirst(), singleReadList.getFirst(), tagsMultiread, tagsSingleRead, true); + } else { + return compareTags(singleReadList.getFirst(), filteredMultiReadList.getFirst(), tagsSingleRead, tagsMultiread, true); + } + + } + + // There's still multiple reads in the multi-read list, so we need to find the concordant read. + // Attempt to find concordant reads between the filtered multi-read list and the single read + // if there were multiple reads that could be concordant, try to find the concordant read. 
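        // Illustrative scenario (hypothetical read names and tags): INPUT_1 yields both mates of
        // pair "read7" with CB values {ACGT, null} while INPUT_2 yields a single "read7" record
        // with CB=ACGT. The untagged mate is removed by the filter above, the surviving mate is
        // compared to the single INPUT_2 read, and the pair is scored concordant. When several
        // tagged mates survive, the stream below keeps the first one whose tags match; if none
        // match, the first surviving mate is recorded as discordant.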
+ SAMRecord singleReadRecord = singleReadList.getFirst(); + List matchingIndices = IntStream.range(0, filteredMultiReadList.size()) + .filter(index -> compareTags(filteredMultiReadList.get(index), singleReadRecord, tagsMultiread, tagsSingleRead, false)) + .boxed() + .toList(); + + SAMRecord selectedMultiReadRecord; + if (matchingIndices.isEmpty()) { + // If no concordant reads, select the first record from the filtered list + selectedMultiReadRecord = filteredMultiReadList.getFirst(); + } else { + // If a concordant reads exist, select the first matching record + selectedMultiReadRecord = filteredMultiReadList.get(matchingIndices.getFirst()); + } + + // Save the tag values for the selected pair of records and return the result of compareTags + return compareTags(selectedMultiReadRecord, singleReadRecord, tagsMultiread, tagsSingleRead, true); + } + + + private boolean compareTags(final SAMRecord r1, final SAMRecord r2, final List tags1, final List tags2, boolean saveTagValues) { + List tagValuesList = new ArrayList<>(); + boolean tagValuesConcordant = true; + + for (int i = 0; i < tags1.size(); i++) { + String v1 = getTagValue(r1, tags1.get(i)); + String v2 = getTagValue(r2, tags2.get(i)); + + tagValuesList.add(v1); + tagValuesList.add(v2); + + // Case 1: Both values are null (concordant) + if (v1 == null && v2 == null) { + continue; + } + + // Case 2: One value is null and the other is not (discordant) + if (v1 == null || v2 == null) { + tagValuesConcordant = false; + continue; + } + // Case 3: Both values are non-null; check equality + if (!v1.equals(v2)) { + tagValuesConcordant = false; + continue; + } } - return true; + + if (saveTagValues) + saveTagValues(tagValuesList, tagValuesConcordant); + + return tagValuesConcordant; + } + + + private String getTagValue (final SAMRecord r, final String tag) { + String result = r.getStringAttribute(tag); + if (result==null) + return null; + if (result.equals(this.TAG_MISSING_VALUE)) + result=null; + return result; + } + + private void saveTagValues (List tagValuesList, boolean tagValuesConcordant) { + if (DISCORDANT_READS_ONLY && tagValuesConcordant) + return; + // short circuit if the output is null. + if (TAG_VALUES_OUTPUT==null) + return; + + switch (mode) { + case NO_COMPRESSION -> { + String tagValues = String.join(TAG_DELIMITER, tagValuesList); + tagValuesString.increment(tagValues); + } + case FIXED_LENGTH_COMPRESSION -> { + byte[] tagValuesBytes = DNACompressor.compressList(tagValuesList); + // special case, where tag values must have consistent lengths. + if (!validateTagLengths(tagValuesList)) + throw new IllegalArgumentException("Tag values have inconsistent lengths, but fixed length compression is enabled. Tag values: " + tagValuesList); + tagValuesByteArray.increment(new ByteArrayWrapper(tagValuesBytes)); + } + /* + case VARYING_LENGTH_COMPRESSION -> { + byte[] tagValuesBytes = DNACompressorVaryingLengths.compressList(tagValuesList); + tagValuesByteArray.increment(new ByteArrayWrapper(tagValuesBytes)); + } + */ + } + } + + /** + * Validate that the tag values have consistent lengths so they can be decoded later. + * Missing data is encoded with a length of 0 and is always valid. + * If a tag length is initialized as non-zero length it will always need to maintain that length. + * If the tag length is missing, it will be updated when a non-zero length is observed, but will then be fixed to that length. + * @param tagValuesList The list of tag values to validate. 
+ * @return True if the tag values have consistent lengths, false otherwise. + */ + private boolean validateTagLengths(List tagValuesList) { + // Compute the lengths of the current tag values, treating nulls as length 0 + int[] lengths = tagValuesList.stream() + .mapToInt(tag -> tag == null ? 0 : tag.length()) + .toArray(); + + // Initialize tagLengths if not already set + if (this.tagLengths == null) { + this.tagLengths = lengths; + return true; // Always valid on first initialization + } + + // Validate and update tagLengths dynamically + boolean valid = true; + for (int i = 0; i < lengths.length; i++) { + if (lengths[i] == 0) { + // Current length is 0; no conflict, continue + continue; + } + + if (this.tagLengths[i] == 0) { + // Update stored tagLengths if the current length is non-zero + this.tagLengths[i] = lengths[i]; + } else if (this.tagLengths[i] != lengths[i]) { + // Conflict: current length does not match stored value + valid = false; + } + } + + return valid; + } + + + private PeekableIterator> getReadIterator (SamHeaderAndIterator headerAndIter, final Integer readQuality, List requiredTags) { + // sort the data by query name. + // Iterator iter = getQueryNameSortedData(headerAndIter); + Iterator iter = headerAndIter.iterator; + + // Filter out reads below a map quality threshold. This removes non-primary reads. + iter = new MapQualityFilteredIterator(iter, readQuality, true).iterator(); + + // If any tag value is set to the missing value (default "-") remove the read. + /* + if (TAG_MISSING_VALUE!=null) { + for (String tag: requiredTags) + iter = new TagValueFilteringIterator(iter, tag, List.of(TAG_MISSING_VALUE), false); + } + */ + + if (this.REMOVE_TAG_SUFFIX!=null) { + for (String tag: requiredTags) + iter = new BAMTagCleanupIterator.Builder(iter) + .tag(tag) + .suffixToRemove(REMOVE_TAG_SUFFIX) + .build(); + } + + // Don't filter out reads without tags. Missing tags are considered discordant. + // iter = new MissingTagFilteringIterator(iter, requiredTags.toArray(new String[0])); + + // if there are suffixes on the read names /1 and /2, remove them. + Pattern pattern = Pattern.compile("/[12]$"); + iter = new ReadNameCleanupIterator.Builder(iter) + .patternToRemove(pattern) + .build(); + + // Queryname sort data if it is not already sorted by queryname. This will spill to disk. + // if the data was already in queryname order, it can be streamed and filtered directly. + if (!headerAndIter.header.getSortOrder().equals(SAMFileHeader.SortOrder.queryname)) { + log.info("Input SAM/BAM not in queryname order, sorting..."); + final ProgressLogger progressLogger = new ProgressLogger(log, 1000000, "Sorting reads in query name order"); + iter = SamRecordSortingIteratorFactory.create(headerAndIter.header, iter, READ_NAME_COMPARATOR, progressLogger); + } + + final GroupingIterator groupingIterator = new GroupingIterator<>(iter, READ_NAME_COMPARATOR); + PeekableIterator> peekableIterator = new PeekableIterator<>(groupingIterator); + return peekableIterator; + } + + /** + * A comparator that sorts reads by query name. + * @return A metrics file. 
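 * Note: SAMRecordQueryNameComparator.fileOrderCompare, used here, orders records by query name
 * only, while the full compare used by QUERYNAME_COMPARATOR below additionally breaks ties on
 * read flags such as first/second of pair.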
+ */ + static final Comparator READ_NAME_COMPARATOR = new Comparator() { + private final SAMRecordQueryNameComparator comp = new SAMRecordQueryNameComparator(); + @Override + public int compare(final SAMRecord s1, final SAMRecord s2) { + return comp.fileOrderCompare(s1, s2); + } + }; + + static final Comparator QUERYNAME_COMPARATOR = new Comparator() { + private final SAMRecordQueryNameComparator comp = new SAMRecordQueryNameComparator(); + @Override + public int compare(final SAMRecord s1, final SAMRecord s2) { + return comp.compare(s1, s2); + } + }; + + /** + * A comparator sorts by first of pair/second of pair. + */ + static final Comparator PAIRED_READ_ORDER_COMPARATOR = new Comparator() { + private final SAMRecordQueryNameComparator comp = new SAMRecordQueryNameComparator(); + @Override + public int compare(final SAMRecord s1, final SAMRecord s2) { + boolean r1Paired = s1.getReadPairedFlag(); + boolean r2Paired = s2.getReadPairedFlag(); + if (r1Paired || r2Paired) { + if (!r1Paired) { + return 1; + } + + if (!r2Paired) { + return -1; + } + + if (s1.getFirstOfPairFlag() && s2.getSecondOfPairFlag()) { + return -1; + } + + if (s1.getSecondOfPairFlag() && s2.getFirstOfPairFlag()) { + return 1; + } + } + return 0; + } + }; + + + + public enum CompressionMode { + NO_COMPRESSION(false, false), + FIXED_LENGTH_COMPRESSION(false, true), + VARYING_LENGTH_COMPRESSION(true, false); + + private final boolean useBaseCompression; + private final boolean useFixedLengthBaseCompression; + + // Constructor + CompressionMode(boolean useBaseCompression, boolean useFixedLengthBaseCompression) { + this.useBaseCompression = useBaseCompression; + this.useFixedLengthBaseCompression = useFixedLengthBaseCompression; + } + + // Method to determine the mode based on flags + public static CompressionMode fromFlags(boolean useBaseCompression, boolean useFixedLengthBaseCompression) { + for (CompressionMode mode : CompressionMode.values()) { + if (mode.useBaseCompression == useBaseCompression && mode.useFixedLengthBaseCompression == useFixedLengthBaseCompression) { + return mode; + } + } + throw new IllegalArgumentException("Invalid flag combination"); + } + } + + /** + * A custom comparator that sorts concatenated keys by their components. + * Use this to sort concatonated tag values by their components consistently to the byte array representation. + */ + public static class CustomKeyComparator implements Comparator { + public static final CustomKeyComparator INSTANCE = new CustomKeyComparator(); + + private CustomKeyComparator() {} + + @Override + public int compare(String key1, String key2) { + // Split and preprocess the keys + String[] components1 = splitAndReplaceNulls(key1); + String[] components2 = splitAndReplaceNulls(key2); + + // Compare component by component + int len = Math.min(components1.length, components2.length); + for (int i = 0; i < len; i++) { + String comp1 = components1[i]; + String comp2 = components2[i]; + + // Handle "NA" and "null" values (sort them before any alphanumerics) + if (comp1.equals("NA") && !comp2.equals("NA")) { + return -1; + } + if (!comp1.equals("NA") && comp2.equals("NA")) { + return 1; + } + + // Compare non-"NA" components lexicographically + int cmp = comp1.compareTo(comp2); + if (cmp != 0) { + return cmp; + } + } + + // If all compared components are equal, the shorter key is smaller + return Integer.compare(components1.length, components2.length); + } + + /** + * Splits the concatenated key by ":" and replaces any null or "null" substrings with "NA". 
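 * Worked example for the enclosing comparator (hypothetical keys): "TT:AA", "NA:GG", "TT:NA"
 * and "AC:GG" sort as "NA:GG", "AC:GG", "TT:NA", "TT:AA", because an "NA" component sorts
 * before any other value and the remaining components compare lexicographically.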
+ * + * @param key The concatenated key. + * @return An array of substrings with "null" values replaced by "NA". + */ + private String[] splitAndReplaceNulls(String key) { + if (key == null) { + return new String[]{"NA"}; // Handle completely null key + } + return Arrays.stream(key.split(":")) + .map(component -> component == null || component.equals("null") ? "NA" : component) + .toArray(String[]::new); + } + } + + + + protected String[] customCommandLineValidation() { + + final ArrayList list = new ArrayList<>(1); + + + if (this.TAGS_1.size()!=this.TAGS_2.size()) + list.add("TAGS_1 and TAGS_2 must be the same length."); + + if (BAM_OUTPUT_1 != null) + IOUtil.assertFileIsWritable(this.BAM_OUTPUT_1); + + if (BAM_OUTPUT_2 != null) + IOUtil.assertFileIsWritable(this.BAM_OUTPUT_2); + + if (READ_COUNT_OUTPUT != null) + IOUtil.assertFileIsWritable(this.READ_COUNT_OUTPUT); + + if (TAG_VALUES_OUTPUT != null) + IOUtil.assertFileIsWritable(this.TAG_VALUES_OUTPUT); + + return CustomCommandLineValidationHelper.makeValue(super.customCommandLineValidation(), list); } /** Stock main method. */ @@ -116,6 +787,4 @@ public static void main(final String[] args) { System.exit(new CompareBAMTagValues().instanceMain(args)); } - - } diff --git a/src/java/org/broadinstitute/dropseqrna/utils/DNACompressor.java b/src/java/org/broadinstitute/dropseqrna/utils/DNACompressor.java new file mode 100644 index 00000000..bfb8c27c --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/utils/DNACompressor.java @@ -0,0 +1,222 @@ +package org.broadinstitute.dropseqrna.utils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +/** + * This compresses and decompresses DNA sequences into a byte array. + * Each sequence is compressed into a byte array, where each base is represented by 2 bits. + * The first byte of the compressed data contains the remainder of the sequence length divided by 4. + * The remaining bytes contain the bases packed into 2 bits each. + * The decompressed sequence is reconstructed by extracting the bases from the packed bytes. + + * The key difference between this version and DNACompressorVaryingLengths is that this version + * does not store the lengths of the original sequences in the compressed data, which can save space if the lengths are short. + * + */ +public class DNACompressor { + + private static final int BITS_PER_BASE = 3; + private static final int BASES_PER_BYTE = 8 / BITS_PER_BASE; + + /** + * Compresses a list of DNA strings into a single byte array, including a leading bitmask for null values. + * + * @param dnaList A list of DNA strings to compress. Each string must contain only 'A', 'C', 'G', 'T' or be null. + * @return A byte array representing the compressed DNA sequences with a leading bitmask. 
+ */ + public static byte[] compressList(List dnaList) { + int numEntries = dnaList.size(); + int bitmaskSize = (numEntries + 7) / 8; // 1 bit per entry, rounded up to the nearest byte + byte[] bitmask = new byte[bitmaskSize]; + + List compressedSequences = new ArrayList<>(); + int totalCompressedSize = bitmaskSize; // Include the bitmask size + + // Create the bitmask and compress non-null sequences + for (int i = 0; i < numEntries; i++) { + if (dnaList.get(i) == null) { + // Mark null entries in the bitmask + bitmask[i / 8] |= (byte) (1 << (i % 8)); + } else { + // Compress non-null sequences + byte[] compressed = compress(dnaList.get(i)); + compressedSequences.add(compressed); + totalCompressedSize += compressed.length; + } + } + + // Pack the bitmask and compressed data into a single byte array + byte[] result = new byte[totalCompressedSize]; + System.arraycopy(bitmask, 0, result, 0, bitmaskSize); + + int offset = bitmaskSize; + for (byte[] compressed : compressedSequences) { + System.arraycopy(compressed, 0, result, offset, compressed.length); + offset += compressed.length; + } + + return result; + } + + + /** + * Decompresses a single byte array into a list of DNA strings, including handling null values using the bitmask. + * + * @param compressed The byte array containing the bitmask and compressed DNA sequences. + * @param lengths An array of lengths of the original DNA sequences. + * @return A list of decompressed DNA strings, including null values. + */ + /** + * Decompresses a single byte array into a list of DNA strings, including handling null values using the bitmask. + * + * @param compressed The byte array containing the bitmask and compressed DNA sequences. + * @param lengths An array of lengths of the original DNA sequences. + * @return A list of decompressed DNA strings, including null values. + */ + public static List decompressList(byte[] compressed, int[] lengths) { + int numEntries = lengths.length; + int bitmaskSize = (numEntries + 7) / 8; + + // Extract the bitmask + byte[] bitmask = new byte[bitmaskSize]; + System.arraycopy(compressed, 0, bitmask, 0, bitmaskSize); + + List decompressed = new ArrayList<>(); + int offset = bitmaskSize; + + // Decompress each sequence based on the bitmask + for (int i = 0; i < numEntries; i++) { + if ((bitmask[i / 8] & (1 << (i % 8))) != 0) { + // Null value indicated by the bitmask + decompressed.add(null); + } else { + // Calculate byte length and remainder for the sequence + int length = lengths[i]; + int[] calc = calculateByteLengthAndRemainder(length); + int byteLength = calc[0]; + + // Extract the compressed data for this sequence + byte[] sequenceData = new byte[1 + byteLength]; + System.arraycopy(compressed, offset, sequenceData, 0, 1 + byteLength); + + // Decompress and add to the list + decompressed.add(decompress(sequenceData)); + offset += (1 + byteLength); + } + } + return decompressed; + } + + /** + * Compresses a single DNA string into a byte array, supporting 'A', 'C', 'G', 'T', and 'N'. 
+ */ + public static byte[] compress(String dna) { + int[] calc = calculateByteLengthAndRemainder(dna.length()); + int byteLength = calc[0]; + int remainder = calc[1]; + + byte[] compressed = new byte[1 + byteLength]; + compressed[0] = (byte) remainder; + + for (int i = 0; i < dna.length(); i++) { + int base = switch (dna.charAt(i)) { + case 'A' -> 0b000; + case 'C' -> 0b001; + case 'G' -> 0b010; + case 'T' -> 0b011; + case 'N' -> 0b100; + default -> throw new IllegalArgumentException("Invalid base: " + dna.charAt(i)); + }; + + int byteIndex = 1 + (i / BASES_PER_BYTE); + int bitPosition = (BASES_PER_BYTE - 1 - (i % BASES_PER_BYTE)) * BITS_PER_BASE; + compressed[byteIndex] |= (byte) (base << bitPosition); + } + + return compressed; + } + + /** + * Decompresses a single compressed DNA sequence back into a string, supporting 'A', 'C', 'G', 'T', and 'N'. + */ + public static String decompress(byte[] compressed) { + int remainder = compressed[0] & 0xFF; + int totalBases = (compressed.length - 1) * BASES_PER_BYTE - ((BASES_PER_BYTE - remainder) % BASES_PER_BYTE); + + StringBuilder dna = new StringBuilder(totalBases); + + for (int i = 0; i < totalBases; i++) { + int byteIndex = 1 + (i / BASES_PER_BYTE); + int bitPosition = (BASES_PER_BYTE - 1 - (i % BASES_PER_BYTE)) * BITS_PER_BASE; + int base = (compressed[byteIndex] >> bitPosition) & 0b111; + + char nucleotide = switch (base) { + case 0b000 -> 'A'; + case 0b001 -> 'C'; + case 0b010 -> 'G'; + case 0b011 -> 'T'; + case 0b100 -> 'N'; + default -> throw new IllegalStateException("Invalid base bits: " + base); + }; + dna.append(nucleotide); + } + + return dna.toString(); + } + + /** + * Calculates the byte length and remainder for a given DNA sequence length. + * + * @param sequenceLength The length of the DNA sequence. + * @return An array where [0] is the byte length and [1] is the remainder. + */ + private static int[] calculateByteLengthAndRemainder(int sequenceLength) { + int remainder = sequenceLength % BASES_PER_BYTE; + int byteLength = (sequenceLength + BASES_PER_BYTE - 1) / BASES_PER_BYTE; + return new int[]{byteLength, remainder}; + } + + + /** + * Compares two compressed DNA outputs (byte arrays) lexicographically, + * skipping the bitmask portion of the array. + * + * @param lengths The lengths of the entries in the compressed data. + * @return Comparator for comparing compressed DNA outputs. 
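 *
 * Worked example for the encoding above (values computed from the 3-bits-per-base,
 * two-bases-per-byte scheme):
 *
 *   byte[] packed = DNACompressor.compressList(Arrays.asList("ACGT", null, "GG"));
 *   // leading bitmask byte 0b00000010 marks index 1 as null; the per-sequence blocks are
 *   // {0, 1, 19} for "ACGT" and {0, 18} for "GG", so packed == {2, 0, 1, 19, 0, 18}
 *   List<String> restored = DNACompressor.decompressList(packed, new int[]{4, 0, 2});
 *   // restored == ["ACGT", null, "GG"]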
+ */ + public static Comparator getDecompressedComparator(int[] lengths) { + return (a, b) -> { + // Decompress both byte arrays into lists of strings + List listA = decompressList(a, lengths); + List listB = decompressList(b, lengths); + + // Compare the decompressed lists lexicographically + int len = Math.min(listA.size(), listB.size()); + for (int i = 0; i < len; i++) { + String valueA = listA.get(i); + String valueB = listB.get(i); + + // Null values are treated as smaller than non-null values + if (valueA == null && valueB == null) { + continue; // Equal + } else if (valueA == null) { + return -1; // Null < Non-null + } else if (valueB == null) { + return 1; // Non-null > Null + } + + // Compare non-null strings + int cmp = valueA.compareTo(valueB); + if (cmp != 0) { + return cmp; // Return as soon as a difference is found + } + } + + // If all compared strings are equal, the shorter list is smaller + return Integer.compare(listA.size(), listB.size()); + }; + } +} diff --git a/src/java/org/broadinstitute/dropseqrna/utils/DNACompressorVaryingLengths.java b/src/java/org/broadinstitute/dropseqrna/utils/DNACompressorVaryingLengths.java new file mode 100644 index 00000000..702eef8f --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/utils/DNACompressorVaryingLengths.java @@ -0,0 +1,165 @@ +package org.broadinstitute.dropseqrna.utils; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +/** + * The `DNACompressor` class provides utilities for compressing and decompressing DNA sequences. + * + * Compression: + * - DNA sequences are represented using 2 bits per nucleotide ('A', 'C', 'G', 'T'). + * - Compressed data is stored in a single byte array. + * - The metadata includes: + * - Number of sequences (4 bytes). + * - Length of each compressed sequence (4 bytes per sequence). + * - Compressed DNA data concatenated sequentially. + * + * Decompression: + * - Reads the metadata to determine the number and lengths of sequences. + * - Extracts the compressed DNA data using the length information. + * - Reconstructs the original DNA sequences from the compressed representation. + * + * Use Cases: + * - Efficient storage and retrieval of DNA sequences of varying lengths. + * - Eliminates the need for external length information by embedding metadata. + * - This does not handle null strings or N bases. + * + * If you know your sequences are of fixed length, you can use `DNACompressor` instead. + * For example if your sequuences were of length 16,16,10,10 this solution would use 34 bytes, and + * `DNACompressor` would use 14 bytes. + */ + +public class DNACompressorVaryingLengths { + + /** + * Compresses a list of DNA strings into a single byte array. + * + * @param dnaList A list of DNA strings to compress. Each string must contain only 'A', 'C', 'G', 'T'. + * @return A byte array representing the compressed DNA sequences, including metadata for lengths. 
+ */ + /* + public static byte[] compressList(List dnaList) { + List compressedSequences = new ArrayList<>(); + int totalSize = 4; // 4 bytes for storing the number of sequences + + // Compress each sequence and calculate total size + for (String dna : dnaList) { + byte[] compressed = compress(dna); + compressedSequences.add(compressed); + totalSize += 4 + compressed.length; // 4 bytes for length + compressed data + } + + // Pack all compressed data into a single byte array + ByteBuffer buffer = ByteBuffer.allocate(totalSize); + buffer.putInt(dnaList.size()); // Number of sequences + + for (byte[] compressed : compressedSequences) { + buffer.putInt(compressed.length); // Length of each compressed sequence + buffer.put(compressed); // Compressed data + } + + return buffer.array(); + } + */ + /** + * Decompresses a single byte array into a list of DNA strings. + * + * @param compressed The byte array containing compressed DNA sequences with metadata. + * @return A list of decompressed DNA strings. + */ + /* + public static List decompressList(byte[] compressed) { + ByteBuffer buffer = ByteBuffer.wrap(compressed); + + // Read the number of sequences + int numSequences = buffer.getInt(); + List decompressed = new ArrayList<>(); + + // Decompress each sequence + for (int i = 0; i < numSequences; i++) { + int length = buffer.getInt(); // Length of the compressed sequence + byte[] sequenceData = new byte[length]; + buffer.get(sequenceData); // Extract the compressed sequence + decompressed.add(decompress(sequenceData)); + } + + return decompressed; + } + */ + + /** + * Compresses a single DNA string into a byte array. + */ + /* + public static byte[] compress(String dna) { + int length = dna.length(); + int remainder = length % 4; + int byteLength = (length + 3) / 4; + + byte[] compressed = new byte[1 + byteLength]; + compressed[0] = (byte) remainder; + + for (int i = 0; i < length; i++) { + int base = switch (dna.charAt(i)) { + case 'A' -> 0b00; + case 'C' -> 0b01; + case 'G' -> 0b10; + case 'T' -> 0b11; + default -> throw new IllegalArgumentException("Invalid base: " + dna.charAt(i)); + }; + + int byteIndex = 1 + (i / 4); + int bitPosition = (3 - (i % 4)) * 2; + compressed[byteIndex] |= (base << bitPosition); + } + + return compressed; + } + */ + /** + * Decompresses a single compressed DNA sequence back into a string. 
+ */ + /* + public static String decompress(byte[] compressed) { + int remainder = compressed[0] & 0xFF; + int byteLength = compressed.length - 1; + int totalBases = (byteLength * 4) - ((4 - remainder) % 4); + + StringBuilder dna = new StringBuilder(totalBases); + + for (int i = 0; i < totalBases; i++) { + int byteIndex = 1 + (i / 4); + int bitPosition = (3 - (i % 4)) * 2; + int base = (compressed[byteIndex] >> bitPosition) & 0b11; + + char nucleotide = switch (base) { + case 0b00 -> 'A'; + case 0b01 -> 'C'; + case 0b10 -> 'G'; + case 0b11 -> 'T'; + default -> throw new IllegalStateException("Invalid base bits: " + base); + }; + dna.append(nucleotide); + } + + return dna.toString(); + } + + public static void main(String[] args) { + // Example DNA sequences + List dnaList = List.of("ACGT", "TTACG", "GCGTACGTAC"); + System.out.println("Original DNA List: " + dnaList); + + // Compress the DNA sequences + byte[] compressed = compressList(dnaList); + System.out.println("Compressed Data: " + java.util.Arrays.toString(compressed)); + + // Decompress back into the original DNA sequences + List decompressed = decompressList(compressed); + System.out.println("Decompressed DNA List: " + decompressed); + + // Verify that the decompressed sequences match the originals + System.out.println("Matches original: " + dnaList.equals(decompressed)); + } + */ +} diff --git a/src/java/org/broadinstitute/dropseqrna/utils/TransformingIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/TransformingIterator.java index c29e4d29..cb04b731 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/TransformingIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/TransformingIterator.java @@ -60,8 +60,6 @@ public void remove() { @Override public void close() { CloserUtil.close(this.underlyingIterator); - // TODO Auto-generated method stub - } } diff --git a/src/java/org/broadinstitute/dropseqrna/utils/alignmentcomparison/QueryNameJointIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/alignmentcomparison/QueryNameJointIterator.java index c6d8f655..1cd64f39 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/alignmentcomparison/QueryNameJointIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/alignmentcomparison/QueryNameJointIterator.java @@ -12,8 +12,10 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordQueryNameComparator; +import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.dropseqrna.utils.SamWriterSink; import java.util.List; @@ -33,20 +35,49 @@ public class QueryNameJointIterator { private final PeekableIterator> iterTwo; private final SAMRecordQueryNameComparator comp; + private SamWriterSink sink1; + private SamWriterSink sink2; + private JointResult next = null; + public final QueryNameJointIteratorMetrics metrics; public QueryNameJointIterator(final PeekableIterator> iterOne, final PeekableIterator> iterTwo) { + this.metrics = new QueryNameJointIteratorMetrics(); this.comp = new SAMRecordQueryNameComparator(); this.iterOne = iterOne; this.iterTwo = iterTwo; getNextSet(); } + /** + * Adds reads sinks to the iterator. For reads that are skipped by the iterator, they are written to the sink. + * These sinks are closed when the iterator is exhausted. + * @param sink1 The sink for reads that are skipped by the first iterator. + * @param sink2 The sink for reads that are skipped by the second iterator. 
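 *
 * Sketch of how CompareBAMTagValues (earlier in this diff) drives this class; the two
 * PeekableIterator<List<SAMRecord>> inputs are assumed to be grouped by query name:
 *
 *   QueryNameJointIterator it = new QueryNameJointIterator(iter1, iter2);
 *   it.addReadSinks(uniqueToOneSink, uniqueToTwoSink);      // optional; either sink may be null
 *   while (it.hasNext()) {
 *       QueryNameJointIterator.JointResult jr = it.next();
 *       // jr.getOne() / jr.getTwo() hold the records sharing a query name
 *   }
 *   boolean disjoint = it.getMetrics().hasDisjointReads();  // true if either input had unmatched reads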
+ */ + public void addReadSinks (SamWriterSink sink1, SamWriterSink sink2) { + this.sink1=sink1; + this.sink2=sink2; + } + + public QueryNameJointIteratorMetrics getMetrics () { + return this.metrics; + } public boolean hasNext() { if (this.next == null) getNextSet(); // iterates until you have a result, or you're out of results. - return this.next != null; + boolean hasResult = this.next != null; + // automatically close the sinks if they exist and there are no more results. + if (!hasResult) { + if (this.sink1!=null) { + sink1.writer.close(); + } + if (this.sink2!=null) { + sink2.writer.close(); + } + } + return hasResult; } /** @@ -68,19 +99,31 @@ public JointResult next() { private void getNextSet() { while (iterOne.hasNext() && iterTwo.hasNext()) { - List r1List = iterOne.peek(); List r2List = iterTwo.peek(); // only have to compare the first record. int cmp = comp.fileOrderCompare(r1List.get(0), r2List.get(0)); // log.info("R1: "+ r1List.toString()+ " R2: " +r2List.toString()); - if (cmp < 0) + if (cmp < 0) { + this.metrics.READ_ONE++; r1List = iterOne.next(); - else if (cmp > 0) + if (this.sink1!=null) { + for (SAMRecord r: r1List) + this.sink1.add(r); + } + } + else if (cmp > 0) { + this.metrics.READ_TWO++; r2List = iterTwo.next(); + if (this.sink2!=null) { + for (SAMRecord r: r2List) + this.sink2.add(r); + } + } else if (cmp == 0) { // do some real work. // grab the next record and process it. + metrics.BOTH++; r1List = iterOne.next(); r2List = iterTwo.next(); JointResult jr = new JointResult(r1List, r2List); @@ -113,4 +156,14 @@ public List getTwo() { } } + + public class QueryNameJointIteratorMetrics extends MetricBase { + public int READ_ONE = 0; + public int READ_TWO = 0; + public int BOTH=0; + + public boolean hasDisjointReads() { + return READ_ONE != 0 || READ_TWO != 0; + } + } } diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIterator.java new file mode 100644 index 00000000..adee67e7 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIterator.java @@ -0,0 +1,111 @@ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import java.util.Iterator; +import java.util.regex.Pattern; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.dropseqrna.utils.TransformingIterator; + +public class BAMTagCleanupIterator extends TransformingIterator { + + private final String tag; + private final String prefixToRemove; + private final String prefixToAdd; + private final String suffixToRemove; + private final String suffixToAdd; + private final Pattern patternToRemove; + + public BAMTagCleanupIterator(Iterator underlyingIterator, String tag, final String prefixToRemove, final String prefixToAdd, String suffixToRemove, String suffixToAdd, Pattern patternToRemove) { + super(underlyingIterator); + this.tag = tag; + this.prefixToRemove = prefixToRemove; + this.prefixToAdd = prefixToAdd; + this.suffixToRemove = suffixToRemove; + this.suffixToAdd = suffixToAdd; + this.patternToRemove = patternToRemove; + } + + @Override + public SAMRecord next() { + SAMRecord r = this.underlyingIterator.next(); + String tagValue = r.getStringAttribute(tag); + + if (tagValue != null) { + // Remove prefix if it exists + if (prefixToRemove != null && tagValue.startsWith(prefixToRemove)) { + tagValue = tagValue.substring(prefixToRemove.length()); + } + + // Add prefix + if (prefixToAdd != null) { + tagValue = prefixToAdd + tagValue; + } 
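            // Illustrative (hypothetical values): with suffixToRemove = "-1", as CompareBAMTagValues
            // uses for CellRanger-style cell barcodes, a tag value of "ACGTACGT-1" passes the prefix
            // steps unchanged and is trimmed to "ACGTACGT" by the suffix step below; patternToRemove,
            // if set, then strips any remaining regex match.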
+ + // Remove suffix if it exists + if (suffixToRemove != null && tagValue.endsWith(suffixToRemove)) { + tagValue = tagValue.substring(0, tagValue.length() - suffixToRemove.length()); + } + + // Add suffix + if (suffixToAdd != null) { + tagValue = tagValue + suffixToAdd; + } + + // Remove pattern if it exists + if (patternToRemove != null) { + tagValue = patternToRemove.matcher(tagValue).replaceAll(""); + } + + r.setAttribute(tag, tagValue); + } + + return r; + } + + public static class Builder { + private final Iterator underlyingIterator; + private String tag; + private String prefixToRemove; + private String prefixToAdd; + private String suffixToRemove; + private String suffixToAdd; + private Pattern patternToRemove; + + public Builder(Iterator underlyingIterator) { + this.underlyingIterator = underlyingIterator; + } + + public Builder tag(String tag) { + this.tag = tag; + return this; + } + + public Builder prefixToRemove(String prefixToRemove) { + this.prefixToRemove = prefixToRemove; + return this; + } + + public Builder prefixToAdd(String prefixToAdd) { + this.prefixToAdd = prefixToAdd; + return this; + } + + public Builder suffixToRemove(String suffixToRemove) { + this.suffixToRemove = suffixToRemove; + return this; + } + + public Builder suffixToAdd(String suffixToAdd) { + this.suffixToAdd = suffixToAdd; + return this; + } + + public Builder patternToRemove(Pattern patternToRemove) { + this.patternToRemove = patternToRemove; + return this; + } + + public BAMTagCleanupIterator build() { + return new BAMTagCleanupIterator(this.underlyingIterator, this.tag, this.prefixToRemove, this.prefixToAdd, this.suffixToRemove, this.suffixToAdd, this.patternToRemove); + } + } +} \ No newline at end of file diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIterator.java new file mode 100644 index 00000000..461a8522 --- /dev/null +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIterator.java @@ -0,0 +1,100 @@ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import java.util.Iterator; +import java.util.regex.Pattern; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.dropseqrna.utils.TransformingIterator; + +public class ReadNameCleanupIterator extends TransformingIterator { + + private final String prefixToRemove; + private final String prefixToAdd; + private final String suffixToRemove; + private final String suffixToAdd; + private final Pattern patternToRemove; + + public ReadNameCleanupIterator(Iterator underlyingIterator, final String prefixToRemove, final String prefixToAdd, String suffixToRemove, String suffixToAdd, Pattern patternToRemove) { + super(underlyingIterator); + this.prefixToRemove = prefixToRemove; + this.prefixToAdd = prefixToAdd; + this.suffixToRemove = suffixToRemove; + this.suffixToAdd = suffixToAdd; + this.patternToRemove = patternToRemove; + } + + @Override + public SAMRecord next() { + SAMRecord r = this.underlyingIterator.next(); + String readName = r.getReadName(); + + // Remove prefix if it exists + if (prefixToRemove != null && readName.startsWith(prefixToRemove)) { + readName = readName.substring(prefixToRemove.length()); + } + + // Add prefix + if (prefixToAdd != null) { + readName = prefixToAdd + readName; + } + + // Remove suffix if it exists + if (suffixToRemove != null && readName.endsWith(suffixToRemove)) { + readName = readName.substring(0, readName.length() - 
suffixToRemove.length()); + } + + // Add suffix + if (suffixToAdd != null) { + readName = readName + suffixToAdd; + } + + // Remove pattern if it exists + if (patternToRemove != null) { + readName = patternToRemove.matcher(readName).replaceAll(""); + } + + r.setReadName(readName); + return r; + } + + public static class Builder { + private final Iterator underlyingIterator; + private String prefixToRemove; + private String prefixToAdd; + private String suffixToRemove; + private String suffixToAdd; + private Pattern patternToRemove; + + public Builder(Iterator underlyingIterator) { + this.underlyingIterator = underlyingIterator; + } + + public Builder prefixToRemove(String prefixToRemove) { + this.prefixToRemove = prefixToRemove; + return this; + } + + public Builder prefixToAdd(String prefixToAdd) { + this.prefixToAdd = prefixToAdd; + return this; + } + + public Builder suffixToRemove(String suffixToRemove) { + this.suffixToRemove = suffixToRemove; + return this; + } + + public Builder suffixToAdd(String suffixToAdd) { + this.suffixToAdd = suffixToAdd; + return this; + } + + public Builder patternToRemove(Pattern patternToRemove) { + this.patternToRemove = patternToRemove; + return this; + } + + public ReadNameCleanupIterator build() { + return new ReadNameCleanupIterator(this.underlyingIterator, this.prefixToRemove, this.prefixToAdd, this.suffixToRemove, this.suffixToAdd, this.patternToRemove); + } + } +} \ No newline at end of file
diff --git a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/TagValueFilteringIterator.java b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/TagValueFilteringIterator.java index 9ee0a22b..780b0b1f 100644 --- a/src/java/org/broadinstitute/dropseqrna/utils/readiterators/TagValueFilteringIterator.java +++ b/src/java/org/broadinstitute/dropseqrna/utils/readiterators/TagValueFilteringIterator.java @@ -12,38 +12,56 @@ import htsjdk.samtools.util.Log; /** - * Filters out a read if the tag value does not match an one of an expected set of values or is not set. + * If retainExpectedValues is true, filters out a read if the tag value does not match one of the expected set of values or is not set. * If the expected set is null or empty, no reads are filtered. + * + * If retainExpectedValues is false, filters out a read if the tag value matches one of the expected values. + * + * @author nemesh + * */ public class TagValueFilteringIterator extends FilteredIterator { final short requiredTag; final Set expectedValues; + private final boolean retainExpectedValues; private static final Log log = Log.getInstance(TagValueFilteringIterator.class); - - public TagValueFilteringIterator(final Iterator underlyingIterator, final String requiredTag, final Collection expectedValues) { - super(underlyingIterator); - this.requiredTag = SAMTag.makeBinaryTag(requiredTag); - this.expectedValues = new HashSet(expectedValues); - } + /** + * The default behavior is to retain reads where the tag is set and has a value in the expected set. 
+ * @param underlyingIterator The iterator to filter + * @param requiredTag The tag to filter on + * @param expectedValues The values to filter on + */ + public TagValueFilteringIterator(final Iterator underlyingIterator, final String requiredTag, final Collection expectedValues) { + this(underlyingIterator, requiredTag, expectedValues, true); + } + public TagValueFilteringIterator(final Iterator underlyingIterator, final String requiredTag, final Collection expectedValues, boolean retainExpectedValues) { + super(underlyingIterator); + this.requiredTag = SAMTag.makeBinaryTag(requiredTag); + this.expectedValues = new HashSet(expectedValues); + this.retainExpectedValues=retainExpectedValues; + } @Override public boolean filterOut(final SAMRecord rec) { Object value = rec.getAttribute(requiredTag); if (value == null) - return true; + return retainExpectedValues; if (this.expectedValues.contains(value)) - return false; - return true; + return !retainExpectedValues; + return retainExpectedValues; } @Override - public void logFilterResults() { - String msg = String.format("Required Tag [%s] Records pass [%d] records fail [%d] ",SAMTag.makeStringTag(requiredTag), this.getRecordsPassed(), this.getRecordsFailed()); + public void logFilterResults() { + if (this.retainExpectedValues) { + String msg = String.format("Required Tag [%s] Records pass [%d] records fail [%d] ",SAMTag.makeStringTag(requiredTag), this.getRecordsPassed(), this.getRecordsFailed()); + log.info(msg); + return; + } + String msg = String.format("Rejected Tag [%s] Records pass [%d] records fail [%d] ",SAMTag.makeStringTag(requiredTag), this.getRecordsPassed(), this.getRecordsFailed()); log.info(msg); } } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrorsTest.java b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrorsTest.java index c3055f0b..bc8ffd4f 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrorsTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/beadsynthesis/DetectBeadSynthesisErrorsTest.java @@ -25,16 +25,14 @@ import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; +import java.util.*; import org.apache.commons.io.FileUtils; import org.broadinstitute.dropseqrna.TranscriptomeException; import org.broadinstitute.dropseqrna.utils.CompareBAMTagValues; import org.testng.Assert; import org.testng.annotations.Test; +import picard.nio.PicardHtsPath; public class DetectBeadSynthesisErrorsTest { @@ -84,16 +82,9 @@ public void testDoWork() { e.printStackTrace(); } - // test BAM - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=EXPECTED_BAM; - cbtv.INPUT_2=cleanBAM; - List tags = new ArrayList<>(); - tags.add("XC"); - cbtv.TAGS=tags; - int r = cbtv.doWork(); - Assert.assertTrue(r==0); - + // test BAM, + List tags = Collections.singletonList("XC"); + compareBAMTagValues(EXPECTED_BAM, cleanBAM, tags, 0); @@ -287,6 +278,15 @@ public void testCustomCommandParsing2 () { Assert.assertTrue(errors.length>0); } - + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = 
cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneExonFunctionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneExonFunctionTest.java index cbe3bff9..64c18679 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneExonFunctionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneExonFunctionTest.java @@ -21,6 +21,7 @@ import htsjdk.samtools.util.OverlapDetector; import picard.annotation.Gene; import picard.annotation.LocusFunction; +import picard.nio.PicardHtsPath; public class TagReadWithGeneExonFunctionTest { @@ -41,16 +42,11 @@ public void testDoWork() { t.ANNOTATIONS_FILE=annotationsFile; t.SUMMARY=tempSummary; int returnVal = t.doWork(); - Assert.assertTrue(returnVal==0); + Assert.assertEquals(returnVal, 0); // test output BAM - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=testBAMFile; - cbtv.INPUT_2=tempBAM; List tags = new ArrayList<>(Arrays.asList("XC", "GE", "GS", "XF")); - cbtv.TAGS=tags; - int r = cbtv.doWork(); - Assert.assertTrue(r==0); + compareBAMTagValues(testBAMFile, tempBAM, tags, 0); } @@ -131,8 +127,8 @@ public void testIntergeicRead () { TagReadWithGeneExonFunction tagger = new TagReadWithGeneExonFunction(); r=tagger.setAnnotations(r, geneOverlapDetector); - Assert.assertEquals(r.getStringAttribute("GE"), null); - Assert.assertEquals(r.getStringAttribute("GS"), null); + Assert.assertNull(r.getStringAttribute("GE")); + Assert.assertNull(r.getStringAttribute("GS")); Assert.assertEquals(r.getStringAttribute("XF"), LocusFunction.INTERGENIC.name()); } @@ -664,5 +660,17 @@ private File getTempReportFile (final String prefix, final String suffix) { return tempFile; } + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertEquals(expectedProgramValue, result); + } + + } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneFunctionTest.java b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneFunctionTest.java index a9cf5b85..ee1904d2 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneFunctionTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithGeneFunctionTest.java @@ -46,13 +46,9 @@ public void testDoWork() throws IOException { Assert.assertTrue(returnVal==0); // test output BAM - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=OUT_BAM; - cbtv.INPUT_2=tempBAM; List tags = new ArrayList<>(Arrays.asList("XC", "gn", "gs", "gf", "XF")); - cbtv.TAGS=tags; - int r = cbtv.doWork(); - Assert.assertTrue(r==0); + compareBAMTagValues(OUT_BAM, tempBAM, tags, 0); + } @@ -601,4 +597,16 @@ private SAMRecord getRecord (final File bamFile, final String recName) { return null; } + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + 
cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } + + } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithIntervalTest.java b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithIntervalTest.java index 3a6f584d..c2e142a4 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithIntervalTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/metrics/TagReadWithIntervalTest.java @@ -3,6 +3,7 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import org.broadinstitute.dropseqrna.utils.CompareBAMTagValues; @@ -10,6 +11,7 @@ import org.testng.annotations.Test; import htsjdk.samtools.util.Interval; +import picard.nio.PicardHtsPath; public class TagReadWithIntervalTest { // any BAM will do. @@ -35,14 +37,9 @@ public void testDoWork() { int r = t.doWork(); Assert.assertTrue (r==0); - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=outBAM; - cbtv.INPUT_2=TAGGED_BAM; - List tags = new ArrayList<>(); - tags.add("ZI"); - cbtv.TAGS=tags; - int result = cbtv.doWork(); - Assert.assertTrue(result==0); + // compare the tagged BAM to the expected BAM. + List tags = Collections.singletonList("ZI"); + compareBAMTagValues(outBAM, TAGGED_BAM, tags, 0); } @@ -56,4 +53,16 @@ public void testGetIntervalName () { Assert.assertEquals(result, "foo,bar"); } + + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } + } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapperTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapperTest.java new file mode 100644 index 00000000..4b227d2d --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/ByteArrayWrapperTest.java @@ -0,0 +1,60 @@ +package org.broadinstitute.dropseqrna.utils; + +import org.testng.annotations.Test; +import static org.testng.Assert.*; + +public class ByteArrayWrapperTest { + + @Test + public void testEquals() { + byte[] data1 = {1, 2, 3}; + byte[] data2 = {1, 2, 3}; + byte[] data3 = {4, 5, 6}; + + ByteArrayWrapper wrapper1 = new ByteArrayWrapper(data1); + ByteArrayWrapper wrapper2 = new ByteArrayWrapper(data2); + ByteArrayWrapper wrapper3 = new ByteArrayWrapper(data3); + + assertEquals(wrapper1, wrapper2); + assertNotEquals(wrapper1, wrapper3); + } + + @Test + public void testHashCode() { + byte[] data1 = {1, 2, 3}; + byte[] data2 = {1, 2, 3}; + byte[] data3 = {4, 5, 6}; + + ByteArrayWrapper wrapper1 = new ByteArrayWrapper(data1); + ByteArrayWrapper wrapper2 = new ByteArrayWrapper(data2); + ByteArrayWrapper wrapper3 = new ByteArrayWrapper(data3); + + assertEquals(wrapper1.hashCode(), wrapper2.hashCode()); + assertNotEquals(wrapper1.hashCode(), wrapper3.hashCode()); + } + + @Test + public void testCompareTo() { + byte[] data1 = {1, 2, 3}; + byte[] data2 = {1, 2, 3}; + byte[] data3 = {4, 5, 6}; + byte[] data4 = {1, 2, 4}; + + ByteArrayWrapper wrapper1 = new ByteArrayWrapper(data1); + ByteArrayWrapper wrapper2 = new ByteArrayWrapper(data2); + ByteArrayWrapper wrapper3 = new 
ByteArrayWrapper(data3); + ByteArrayWrapper wrapper4 = new ByteArrayWrapper(data4); + + assertEquals(wrapper1.compareTo(wrapper2), 0); + assertTrue(wrapper1.compareTo(wrapper3) < 0); + assertTrue(wrapper3.compareTo(wrapper1) > 0); + assertTrue(wrapper1.compareTo(wrapper4) < 0); + } + + @Test + public void testGetData() { + byte[] data = {1, 2, 3}; + ByteArrayWrapper wrapper = new ByteArrayWrapper(data); + assertEquals(wrapper.getData(), data); + } +} \ No newline at end of file diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValuesTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValuesTest.java new file mode 100644 index 00000000..bfe9b26c --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/CompareBAMTagValuesTest.java @@ -0,0 +1,223 @@ +package org.broadinstitute.dropseqrna.utils; + + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordSetBuilder; +import htsjdk.samtools.util.Log; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; +import picard.nio.PicardHtsPath; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertNull; + +public class CompareBAMTagValuesTest { + + private static final File TESTDATA_DIR = new File("testdata/org/broadinstitute/dropseq/utils"); + + private static final PicardHtsPath STARsolo = new PicardHtsPath(TESTDATA_DIR + "/compare_tags_STARsolo.sam"); + private static final PicardHtsPath CELLRANGER = new PicardHtsPath(TESTDATA_DIR + "/compare_tags_CellRanger.sam"); + private static final File test1Expected = new File(TESTDATA_DIR + "/testCompareBAMTagValuesUnpairedReads_report.txt"); + + // private static final PicardHtsPath STARsolo = new PicardHtsPath(TESTDATA_DIR+"/test1.sam"); + // private static final PicardHtsPath CELLRANGER = new PicardHtsPath(TESTDATA_DIR+"/test2.sam"); + + private static final PicardHtsPath DROPSEQ = new PicardHtsPath(TESTDATA_DIR + "/compare_tags_DropSeq.sam"); + private final Log log = Log.getInstance(CompareBAMTagValuesTest.class); + + @Test + // TEST 1. 
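+ // Compares the unpaired STARsolo and CellRanger reads on the CB and CR tags using fixed-length
+ // base compression, with STRICT disabled. The discordant and unique BAMs, the tag value report and
+ // the read count report are written to temp files, and the tag value report is checked against
+ // testCompareBAMTagValuesUnpairedReads_report.txt.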
+ public void testCompareBAMTagValuesUnpairedReads() { + CompareBAMTagValues c = new CompareBAMTagValues(); + c.INPUT_1 = Collections.singletonList(STARsolo); + c.INPUT_2 = Collections.singletonList(CELLRANGER); + c.TAGS_1 = Arrays.asList("CB", "CR"); + c.TAGS_2 = Arrays.asList("CB", "CR"); + c.BAM_OUTPUT_1 = TestUtils.getTempReportFile("compare_starsolo_cellranger_1.", ".bam"); + c.BAM_OUTPUT_2 = TestUtils.getTempReportFile("compare_starsolo_cellranger_2.", ".bam"); + c.BAM_UNIQUE_1 = TestUtils.getTempReportFile("starsolo_unique.", ".bam"); + c.BAM_UNIQUE_2 = TestUtils.getTempReportFile("cell_ranger_unique.", ".bam"); + c.TAG_VALUES_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_cellranger", ".report"); + c.READ_COUNT_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_cellranger", ".read_count"); + c.useFixedLengthBaseCompression = true; + c.STRICT = false; + c.BAM_OUTPUT_1.deleteOnExit(); + c.BAM_OUTPUT_2.deleteOnExit(); + c.BAM_UNIQUE_1.deleteOnExit(); + c.BAM_UNIQUE_2.deleteOnExit(); + c.TAG_VALUES_OUTPUT.deleteOnExit(); + c.READ_COUNT_OUTPUT.deleteOnExit(); + int result = c.doWork(); + log.info("Result file: " + c.TAG_VALUES_OUTPUT.getAbsolutePath()); + Assert.assertEquals(result, 0); + boolean outputSame = TestUtils.testFilesSame(test1Expected, c.TAG_VALUES_OUTPUT); + Assert.assertTrue(outputSame); + } + + @Test + // do not use fixed length base compression for the tags. + public void testCompareBAMTagValuesUnpairedReads2() { + CompareBAMTagValues c = new CompareBAMTagValues(); + c.INPUT_1 = Collections.singletonList(STARsolo); + c.INPUT_2 = Collections.singletonList(CELLRANGER); + c.TAGS_1 = Arrays.asList("CB", "CR"); + c.TAGS_2 = Arrays.asList("CB", "CR"); + c.BAM_OUTPUT_1 = TestUtils.getTempReportFile("compare_starsolo_cellranger_1.", ".bam"); + c.BAM_OUTPUT_2 = TestUtils.getTempReportFile("compare_starsolo_cellranger_2.", ".bam"); + c.TAG_VALUES_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_cellranger", ".report"); + c.READ_COUNT_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_cellranger", ".read_count"); + c.useFixedLengthBaseCompression = false; + c.STRICT = false; + c.BAM_OUTPUT_1.deleteOnExit(); + c.BAM_OUTPUT_2.deleteOnExit(); + c.TAG_VALUES_OUTPUT.deleteOnExit(); + c.READ_COUNT_OUTPUT.deleteOnExit(); + int result = c.doWork(); + log.info("Result file: " + c.TAG_VALUES_OUTPUT.getAbsolutePath()); + Assert.assertEquals(result, 0); + boolean outputSame = TestUtils.testFilesSame(test1Expected, c.TAG_VALUES_OUTPUT); + Assert.assertTrue(outputSame); + } + + @Test + public void testCompareBAMTagValuesPairedReads() { + // test 2 - DROPSEQ is a paired end read version of STARsolo. 
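+ // The paired-end DROPSEQ reads are matched to the CELLRANGER reads by query name and compared on the CB and CR tags;
+ // the resulting tag value report is expected to be identical to the report from the unpaired test above.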
+ CompareBAMTagValues c = new CompareBAMTagValues(); + c.INPUT_1 = Collections.singletonList(DROPSEQ); + c.INPUT_2 = Collections.singletonList(CELLRANGER); + c.TAGS_1 = Arrays.asList("CB", "CR"); + c.TAGS_2 = Arrays.asList("CB", "CR"); + c.BAM_OUTPUT_1 = TestUtils.getTempReportFile("compare_starsolo_dropseq_1.", ".bam"); + c.BAM_OUTPUT_2 = TestUtils.getTempReportFile("compare_starsolo_dropseq_2.", ".bam"); + c.TAG_VALUES_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_dropseq", ".report"); + c.READ_COUNT_OUTPUT = TestUtils.getTempReportFile("compare_starsolo_dropseq", ".read_count"); + c.useFixedLengthBaseCompression = true; + c.STRICT = false; + c.BAM_OUTPUT_1.deleteOnExit(); + c.BAM_OUTPUT_2.deleteOnExit(); + c.TAG_VALUES_OUTPUT.deleteOnExit(); + c.READ_COUNT_OUTPUT.deleteOnExit(); + int result = c.doWork(); + log.info("Result file: " + c.TAG_VALUES_OUTPUT.getAbsolutePath()); + Assert.assertEquals(result, 0); + boolean outputSame = TestUtils.testFilesSame(test1Expected, c.TAG_VALUES_OUTPUT); + Assert.assertTrue(outputSame); + + } + + + /** + * READ_NAME_COMPARATOR TESTS + * (The things we do for code coverage...) + */ + + private SAMRecordSetBuilder builder; + private List records; + + @BeforeMethod + public void setUp() { + builder = new SAMRecordSetBuilder(); + records = new ArrayList<>(); + } + + @Test + public void testPairedReadOrderComparator_FirstOfPair() { + builder.addUnmappedFragment("read1_1"); + builder.addUnmappedFragment("read1_2"); + builder.iterator().forEachRemaining(records::add); + + SAMRecord read1 = records.get(0); + read1.setReadPairedFlag(true); + read1.setFirstOfPairFlag(true); + + SAMRecord read2 = records.get(1); + read2.setReadPairedFlag(true); + read2.setSecondOfPairFlag(true); + + int result = CompareBAMTagValues.PAIRED_READ_ORDER_COMPARATOR.compare(read1, read2); + assertTrue(result < 0, "First of pair should come before second of pair"); + } + + @Test + public void testPairedReadOrderComparator_SecondOfPair() { + builder.addUnmappedFragment("read1_1"); + builder.addUnmappedFragment("read1_2"); + builder.iterator().forEachRemaining(records::add); + + SAMRecord read1 = records.get(0); + read1.setReadPairedFlag(true); + read1.setSecondOfPairFlag(true); + + SAMRecord read2 = records.get(1); + read2.setReadPairedFlag(true); + read2.setFirstOfPairFlag(true); + + int result = CompareBAMTagValues.PAIRED_READ_ORDER_COMPARATOR.compare(read1, read2); + assertTrue(result > 0, "Second of pair should come after first of pair"); + } + + @Test + public void testPairedReadOrderComparator_UnpairedRead() { + builder.addUnmappedFragment("read1_1"); + builder.addUnmappedFragment("read1_2"); + builder.iterator().forEachRemaining(records::add); + + SAMRecord read1 = records.get(0); + read1.setReadPairedFlag(false); + + SAMRecord read2 = records.get(1); + read2.setReadPairedFlag(true); + read2.setFirstOfPairFlag(true); + + int result = CompareBAMTagValues.PAIRED_READ_ORDER_COMPARATOR.compare(read1, read2); + assertTrue(result > 0, "Unpaired read should come after paired read"); + } + + @Test + public void testPairedReadOrderComparator_BothUnpaired() { + builder.addUnmappedFragment("read1_1"); + builder.addUnmappedFragment("read2_1"); + builder.iterator().forEachRemaining(records::add); + + SAMRecord read1 = records.get(0); + read1.setReadPairedFlag(false); + + SAMRecord read2 = records.get(1); + read2.setReadPairedFlag(false); + + int result = CompareBAMTagValues.PAIRED_READ_ORDER_COMPARATOR.compare(read1, read2); + assertEquals(result, 0, "Both unpaired reads should be 
considered equal"); + } + + @Test + public void testCustomCommandLineValidation() { + CompareBAMTagValues compareBAMTagValues = new CompareBAMTagValues(); + + // Case 1: TAGS_1 and TAGS_2 length mismatch + compareBAMTagValues.TAGS_1 = Arrays.asList("CB", "CR"); + compareBAMTagValues.TAGS_2 = Collections.singletonList("CB"); + String[] validationResult = compareBAMTagValues.customCommandLineValidation(); + assertEquals(validationResult.length, 1); + assertEquals(validationResult[0], "TAGS_1 and TAGS_2 must be the same length."); + + // Case 2: Writable files + compareBAMTagValues.TAGS_1 = Arrays.asList("CB", "CR"); + compareBAMTagValues.TAGS_2 = Arrays.asList("CB", "CR"); + compareBAMTagValues.BAM_OUTPUT_1 = new File("test_output_1.bam"); + compareBAMTagValues.BAM_OUTPUT_2 = new File("test_output_2.bam"); + compareBAMTagValues.READ_COUNT_OUTPUT = new File("read_count_output.txt"); + compareBAMTagValues.TAG_VALUES_OUTPUT = new File("tag_values_output.txt"); + validationResult = compareBAMTagValues.customCommandLineValidation(); + assertNull(validationResult); + + } + +} diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/DNACompressorTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/DNACompressorTest.java new file mode 100644 index 00000000..a32f9698 --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/DNACompressorTest.java @@ -0,0 +1,86 @@ +package org.broadinstitute.dropseqrna.utils; + +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +import static org.testng.Assert.*; + +public class DNACompressorTest { + + @Test(dataProvider = "dnaStringLists") + public void testCompressListRoundTrip(String[] dnaList) { + // Compress the DNA sequences + byte[] compressed = DNACompressor.compressList(Arrays.asList(dnaList)); + int[] lengths = getListLengths(dnaList); + + // Decompress the DNA sequences + List result = DNACompressor.decompressList(compressed, lengths); + // Verify that the decompressed sequences match the original + assertEquals(result, Arrays.asList(dnaList)); + } + + @Test(dataProvider = "dnaStrings") + public void testCompressRoundTrip(String dna) { + // Compress the DNA sequence + byte[] compressed = DNACompressor.compress(dna); + // Decompress the DNA sequence + String result = DNACompressor.decompress(compressed); + // Verify that the decompressed sequence matches the original + assertEquals(result, dna); + } + + @Test + public void testGetDecompressedComparator() { + // Sample DNA sequences + List dnaList1 = Arrays.asList("ACGT", "TTACG", "GCGTACGTAC"); + List dnaList2 = Arrays.asList("ACGT", "TTACG", "GCGTACGTAC"); + List dnaList3 = Arrays.asList("ACGT", "TTACG", "GCGTANGTAC"); + + // Compress the DNA sequences + byte[] compressed1 = DNACompressor.compressList(dnaList1); + byte[] compressed2 = DNACompressor.compressList(dnaList2); + byte[] compressed3 = DNACompressor.compressList(dnaList3); + + // Lengths of the original sequences + int[] lengths = dnaList1.stream().mapToInt(String::length).toArray(); + + // Get the comparator + Comparator comparator = DNACompressor.getDecompressedComparator(lengths); + + // Compare the compressed sequences + assertEquals(comparator.compare(compressed1, compressed2), 0); + assertTrue(comparator.compare(compressed1, compressed3) < 0); + assertTrue(comparator.compare(compressed3, compressed1) > 0); + } + + + + @DataProvider(name = "dnaStrings") + public Object[][] dnaStrings() { + return new 
Object[][] { + {"ACGT"}, + {"TTACG"}, + {"GCGTACGTAC"}, + {"GCGTANGTAC"}, + }; + } + + @DataProvider(name = "dnaStringLists") + public Object[][] dnaStringLists() { + return new Object[][] { + {new String[] {"ACGT", "TTACG", "GCGTACGTAC"}}, + {new String[] {"ACGT", "TTACG", null, "GCGTACGTAC"}}, + {new String[] {"ACGT", "TTACG", null, "GCGTANGTAC"}} + }; + } + + private int [] getListLengths(String [] dnaList) { + return Arrays.stream(dnaList).mapToInt(dna -> dna == null ? 0 : dna.length()).toArray(); + } + + +} \ No newline at end of file diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/FilterBamByTagTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/FilterBamByTagTest.java index 91e9db7f..4aa82b77 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/utils/FilterBamByTagTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/FilterBamByTagTest.java @@ -105,15 +105,10 @@ public void testDoWorkPaired () throws IOException { Assert.assertEquals(PAIRED_READS_ACCEPTED, metrics.get(0).READS_ACCEPTED); Assert.assertEquals(PAIRED_READS_REJECTED, metrics.get(0).READS_REJECTED); - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=PAIRED_INPUT_FILE_FILTERED; - cbtv.INPUT_2=f.OUTPUT; - List tags = new ArrayList<>(); - tags.add("XC"); - cbtv.TAGS=tags; - int r = cbtv.doWork(); - Assert.assertEquals(r, 0); - + // test with tag values file. + List tags = Collections.singletonList("XC"); + compareBAMTagValues(PAIRED_INPUT_FILE_FILTERED, f.OUTPUT, tags, 0); + } @Test @@ -129,14 +124,10 @@ public void testDoWorkUnPaired () throws IOException { Assert.assertEquals(UNPAIRED_READS_ACCEPTED, metrics.get(0).READS_ACCEPTED); Assert.assertEquals(UNPAIRED_READS_REJECTED, metrics.get(0).READS_REJECTED); - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=UNPAIRED_INPUT_FILE_FILTERED; - cbtv.INPUT_2=f.OUTPUT; - List tags = new ArrayList<>(); - tags.add("XC"); - cbtv.TAGS=tags; - int r = cbtv.doWork(); - Assert.assertEquals(r, 0); + // test with tag values file. + List tags = Collections.singletonList("XC"); + compareBAMTagValues(UNPAIRED_INPUT_FILE_FILTERED, f.OUTPUT, tags, 0); + // test alternate path without tag values file. f.INPUT=Collections.singletonList(new PicardHtsPath(UNPAIRED_INPUT_FILE)); @@ -148,13 +139,9 @@ public void testDoWorkUnPaired () throws IOException { f.OUTPUT.deleteOnExit(); Assert.assertEquals(f.doWork(), 0); - - cbtv.INPUT_1=UNPAIRED_INPUT_FILE_FILTERED_AAAGTAGAGTGG; - cbtv.INPUT_2=f.OUTPUT; - cbtv.TAGS=tags; - r = cbtv.doWork(); - Assert.assertEquals(r, 0); - + // test with tag values file. 
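+ // Compare the XC tag of the re-filtered output against UNPAIRED_INPUT_FILE_FILTERED_AAAGTAGAGTGG; the helper runs the comparison in STRICT mode and expects a return value of 0.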
+ tags = Collections.singletonList("XC"); + compareBAMTagValues(UNPAIRED_INPUT_FILE_FILTERED_AAAGTAGAGTGG, f.OUTPUT, tags, 0); } @@ -400,4 +387,15 @@ public void testArgErrors () throws IOException { } + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } + } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/SplitBamByCellTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/SplitBamByCellTest.java index b2ecf70f..fb1f0ae4 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/utils/SplitBamByCellTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/SplitBamByCellTest.java @@ -110,12 +110,10 @@ public void testDoWork() { TestUtils.testMetricsFilesEqual(originalMetrics, mergedMetrics); // Compare the XC and XM tags for the input test BAM and the merged BAM files - final CompareBAMTagValues tagsComparator = new CompareBAMTagValues(); - tagsComparator.INPUT_1 = TEST_BAM; - tagsComparator.INPUT_2 = mergedOutputBAM; - tagsComparator.TAGS = new ArrayList<>(Arrays.asList("XC", "XM")); - Assert.assertEquals(tagsComparator.doWork(), 0); + List tags = new ArrayList<>(Arrays.asList("XC", "XM")); + compareBAMTagValues(TEST_BAM, mergedOutputBAM, tags, 0); + // Compare the split BAM report to the expected report Assert.assertTrue(TestUtils.testMetricsFilesEqual(report, EXPECTED_REPORT)); // Manifest content will not be identical because of temp file names, so just compare fields expected @@ -447,4 +445,16 @@ public Object[][] testDeleteInputsAndDefaultOutputsDataProvider() { {null, false}, }; } + + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } + } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/editdistance/CollapseBarcodesInPlaceTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/editdistance/CollapseBarcodesInPlaceTest.java index b152de40..5ed18a39 100644 --- a/src/tests/java/org/broadinstitute/dropseqrna/utils/editdistance/CollapseBarcodesInPlaceTest.java +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/editdistance/CollapseBarcodesInPlaceTest.java @@ -2,16 +2,14 @@ import java.io.File; import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import org.broadinstitute.dropseqrna.utils.CompareBAMTagValues; import org.broadinstitute.dropseqrna.utils.ObjectCounter; import org.testng.annotations.Test; import org.testng.Assert; +import picard.nio.PicardHtsPath; public class CollapseBarcodesInPlaceTest { @@ -103,33 +101,16 @@ public void testDoWorkSubstitution1 () { p.OUTPUT=out; p.doWork(); + List tags = Collections.singletonList("ZC"); + // the two expected data sets differ. 
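+ // TEST_SUB and TEST_INDEL carry different collapsed ZC values, so comparing them is expected to return 1;
+ // the collapsed output should match TEST_SUB (return 0) and differ from TEST_INDEL (return 1).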
- CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_SUB; - cbtv.INPUT_2=TEST_INDEL; - List tags = new ArrayList<>(); - tags.add("ZC"); - cbtv.TAGS=tags; - int result = cbtv.doWork(); - Assert.assertTrue(result==1); - - // test against the correct answer for substiution - cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_SUB; - cbtv.INPUT_2=out; - tags = new ArrayList<>(); - tags.add("ZC"); - cbtv.TAGS=tags; - result = cbtv.doWork(); - Assert.assertTrue(result==0); + compareBAMTagValues(TEST_SUB, TEST_INDEL, tags, 1); + + // test against the correct answer for substitution + compareBAMTagValues(TEST_SUB, out, tags, 0); // test against the wrong answer for substitution - cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_INDEL; - cbtv.INPUT_2=out; - cbtv.TAGS=tags; - result = cbtv.doWork(); - Assert.assertTrue(result==1); + compareBAMTagValues(TEST_INDEL, out, tags, 1); } @Test @@ -151,33 +132,17 @@ public void testDoWorkSubstitution1Threaded () { p.OUTPUT=out; p.doWork(); + List tags = Collections.singletonList("ZC"); + // the two expected data sets differ. - CompareBAMTagValues cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_SUB; - cbtv.INPUT_2=TEST_INDEL; - List tags = new ArrayList<>(); - tags.add("ZC"); - cbtv.TAGS=tags; - int result = cbtv.doWork(); - Assert.assertTrue(result==1); - - // test against the correct answer for substiution - cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_SUB; - cbtv.INPUT_2=out; - tags = new ArrayList<>(); - tags.add("ZC"); - cbtv.TAGS=tags; - result = cbtv.doWork(); - Assert.assertTrue(result==0); + compareBAMTagValues(TEST_SUB, TEST_INDEL, tags, 1); + + // test against the correct answer for substitution + compareBAMTagValues(TEST_SUB, out, tags, 0); // test against the wrong answer for substitution - cbtv = new CompareBAMTagValues(); - cbtv.INPUT_1=TEST_INDEL; - cbtv.INPUT_2=out; - cbtv.TAGS=tags; - result = cbtv.doWork(); - Assert.assertTrue(result==1); + compareBAMTagValues(TEST_INDEL, out, tags, 1); + } private File getTempReportFile () { File tempFile=null; @@ -189,4 +154,15 @@ private File getTempReportFile () { } return tempFile; } + + private void compareBAMTagValues(File input1, File input2, List tags, int expectedProgramValue) { + CompareBAMTagValues cbtv = new CompareBAMTagValues(); + cbtv.INPUT_1 = Collections.singletonList(new PicardHtsPath(input1)); + cbtv.INPUT_2 = Collections.singletonList(new PicardHtsPath(input2)); + cbtv.TAGS_1 = tags; + cbtv.TAGS_2 = tags; + cbtv.STRICT = true; + int result = cbtv.doWork(); + Assert.assertTrue(result == expectedProgramValue); + } } diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIteratorTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIteratorTest.java new file mode 100644 index 00000000..554e5b6d --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/BAMTagCleanupIteratorTest.java @@ -0,0 +1,97 @@ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordSetBuilder; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Pattern; + +import static org.testng.Assert.*; + +public class BAMTagCleanupIteratorTest { + + private List records; + private Iterator underlyingIterator; + + @BeforeMethod + public void setUp() { + SAMRecordSetBuilder builder 
= new SAMRecordSetBuilder(); + builder.addUnmappedFragment("read1"); + builder.addUnmappedFragment("read2"); + records = new ArrayList<>(); + builder.iterator().forEachRemaining(records::add); + } + + @Test + public void testPrefixRemoval() { + records.forEach(record -> record.setAttribute("XT", "prefix_value")); + underlyingIterator = records.iterator(); + BAMTagCleanupIterator iterator = new BAMTagCleanupIterator.Builder(underlyingIterator) + .tag("XT") + .prefixToRemove("prefix_") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getStringAttribute("XT"), "value"); + } + + @Test + public void testSuffixRemoval() { + records.forEach(record -> record.setAttribute("XT", "value_suffix")); + underlyingIterator = records.iterator(); + BAMTagCleanupIterator iterator = new BAMTagCleanupIterator.Builder(underlyingIterator) + .tag("XT") + .suffixToRemove("_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getStringAttribute("XT"), "value"); + } + + @Test + public void testPatternRemoval() { + records.forEach(record -> record.setAttribute("XT", "value_to_remove")); + underlyingIterator = records.iterator(); + BAMTagCleanupIterator iterator = new BAMTagCleanupIterator.Builder(underlyingIterator) + .tag("XT") + .patternToRemove(Pattern.compile("_to_remove")) + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getStringAttribute("XT"), "value"); + } + + @Test + public void testPrefixAndSuffixAddition() { + records.forEach(record -> record.setAttribute("XT", "value")); + underlyingIterator = records.iterator(); + BAMTagCleanupIterator iterator = new BAMTagCleanupIterator.Builder(underlyingIterator) + .tag("XT") + .prefixToAdd("prefix_") + .suffixToAdd("_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getStringAttribute("XT"), "prefix_value_suffix"); + } + + @Test + public void testCombinedTransformations() { + records.forEach(record -> record.setAttribute("XT", "prefix_value_suffix")); + underlyingIterator = records.iterator(); + BAMTagCleanupIterator iterator = new BAMTagCleanupIterator.Builder(underlyingIterator) + .tag("XT") + .prefixToRemove("prefix_") + .suffixToRemove("_suffix") + .prefixToAdd("new_prefix_") + .suffixToAdd("_new_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getStringAttribute("XT"), "new_prefix_value_new_suffix"); + } +} \ No newline at end of file diff --git a/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIteratorTest.java b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIteratorTest.java new file mode 100644 index 00000000..47b7aa15 --- /dev/null +++ b/src/tests/java/org/broadinstitute/dropseqrna/utils/readiterators/ReadNameCleanupIteratorTest.java @@ -0,0 +1,87 @@ +package org.broadinstitute.dropseqrna.utils.readiterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordSetBuilder; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Pattern; + +import static org.testng.Assert.*; + +public class ReadNameCleanupIteratorTest { + + private List records; + private Iterator underlyingIterator; + + @BeforeMethod + public void setUp() { + SAMRecordSetBuilder builder = new SAMRecordSetBuilder(); + builder.addUnmappedFragment("prefix_read1_suffix"); + builder.addUnmappedFragment("prefix_read2_suffix"); + records = 
new ArrayList<>(); + builder.iterator().forEachRemaining(records::add); + } + + @Test + public void testPrefixRemoval() { + underlyingIterator = records.iterator(); + ReadNameCleanupIterator iterator = new ReadNameCleanupIterator.Builder(underlyingIterator) + .prefixToRemove("prefix_") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getReadName(), "read1_suffix"); + } + + @Test + public void testSuffixRemoval() { + underlyingIterator = records.iterator(); + ReadNameCleanupIterator iterator = new ReadNameCleanupIterator.Builder(underlyingIterator) + .suffixToRemove("_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getReadName(), "prefix_read1"); + } + + @Test + public void testPatternRemoval() { + underlyingIterator = records.iterator(); + ReadNameCleanupIterator iterator = new ReadNameCleanupIterator.Builder(underlyingIterator) + .patternToRemove(Pattern.compile("_read1_")) + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getReadName(), "prefixsuffix"); + } + + @Test + public void testPrefixAndSuffixAddition() { + underlyingIterator = records.iterator(); + ReadNameCleanupIterator iterator = new ReadNameCleanupIterator.Builder(underlyingIterator) + .prefixToAdd("new_prefix_") + .suffixToAdd("_new_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getReadName(), "new_prefix_prefix_read1_suffix_new_suffix"); + } + + @Test + public void testCombinedTransformations() { + underlyingIterator = records.iterator(); + ReadNameCleanupIterator iterator = new ReadNameCleanupIterator.Builder(underlyingIterator) + .prefixToRemove("prefix_") + .suffixToRemove("_suffix") + .prefixToAdd("new_prefix_") + .suffixToAdd("_new_suffix") + .build(); + + SAMRecord result = iterator.next(); + assertEquals(result.getReadName(), "new_prefix_read1_new_suffix"); + } +} \ No newline at end of file diff --git a/testdata/org/broadinstitute/dropseq/utils/compare_tags_CellRanger.sam b/testdata/org/broadinstitute/dropseq/utils/compare_tags_CellRanger.sam new file mode 100644 index 00000000..9eb0dbad --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/utils/compare_tags_CellRanger.sam @@ -0,0 +1,13 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr1 LN:248956422 +@CO Dataset 2: Data set mimics CellRanger outputs - "-1" suffix on CB tag and suffix \1 on read name +@CO CellRanger does not encode the CB tag when repair fails, Optimus encodes '-' as the missing value. 
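+@CO Read names encode the expected comparison outcome: pass_* reads are expected to be concordant with the STARsolo data set, fail_* reads to be discordant or to be missing a tag value.
+@CO pass_read_only_cellranger appears only in this data set.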
+pass_all_tags_agree/1 0 chr1 1000 255 50M * 0 0 * * CB:Z:TTTCGGGAC-1 CR:Z:TTTCGGGAC +pass_all_tags_agree2/1 0 chr1 1000 255 50M * 0 0 * * CB:Z:TTTCGGGAC-1 CR:Z:TTTCGGGAC +fail_same_unrepaired_diff_repaired/1 0 chr1 1010 255 50M * 0 0 * * CB:Z:CCCCGGGAT-1 CR:Z:CCCCGGGTT +fail_diff_unrepaired_same_repaired/1 0 chr1 1020 255 50M * 0 0 * * CB:Z:GGGCGGGAA-1 CR:Z:GGGCGGGAC +fail_diff_repaired_diff_unrepaired/1 0 chr1 1030 255 50M * 0 0 * * CB:Z:AAACGGGTT-1 CR:Z:AAACGGGAA +fail_unset_repaired_tag/1 0 chr1 1030 255 50M * 0 0 * * CR:Z:AAAAAAAAA +fail_missing_data_repaired_tag_discordant/1 0 chr1 1030 255 50M * 0 0 * * CR:Z:AAAAAAAAA +pass_missing_data_repaired_tag_concordant/1 0 chr1 1030 255 50M * 0 0 * * CR:Z:AAAAAAAAA +pass_read_only_cellranger/1 0 chr1 1030 255 50M * 0 0 * * CB:Z:TTTTTTTTT CR:Z:TTTTTTTTT diff --git a/testdata/org/broadinstitute/dropseq/utils/compare_tags_DropSeq.sam b/testdata/org/broadinstitute/dropseq/utils/compare_tags_DropSeq.sam new file mode 100644 index 00000000..6dd3e6b4 --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/utils/compare_tags_DropSeq.sam @@ -0,0 +1,23 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr1 LN:248956422 +@CO Dataset 3: Paired-end reads from DropSeq. +@CO First read of each pair has no tags (flag 77, unmapped). +@CO Second read of each pair has tags matching Dataset 2 (CellRanger, flag 141, reverse strand, unmapped). +pass_all_tags_agree 77 * 0 0 * * 0 0 * * +pass_all_tags_agree 141 * 0 0 * * 0 0 * * CB:Z:TTTCGGGAC CR:Z:TTTCGGGAC +pass_all_tags_agree2 77 * 0 0 * * 0 0 * * +pass_all_tags_agree2 141 * 0 0 * * 0 0 * * CB:Z:TTTCGGGAC CR:Z:TTTCGGGAC +fail_same_unrepaired_diff_repaired 77 * 0 0 * * 0 0 * * +fail_same_unrepaired_diff_repaired 141 * 0 0 * * 0 0 * * CB:Z:CCCCGGGAC CR:Z:CCCCGGGTT +fail_diff_unrepaired_same_repaired 77 * 0 0 * * 0 0 * * +fail_diff_unrepaired_same_repaired 141 * 0 0 * * 0 0 * * CB:Z:GGGCGGGAA CR:Z:GGGCGGGAT +fail_diff_repaired_diff_unrepaired 77 * 0 0 * * 0 0 * * +fail_diff_repaired_diff_unrepaired 141 * 0 0 * * 0 0 * * CB:Z:AAACGGGAA CR:Z:AAACGGGTT +fail_unset_repaired_tag 77 * 0 0 * * 0 0 * * +fail_unset_repaired_tag 141 * 0 0 * * 0 0 * * CB:Z:TTTTTTTTT CR:Z:AAAAAAAAA +fail_missing_data_repaired_tag_discordant 77 * 0 0 * * 0 0 * * +fail_missing_data_repaired_tag_discordant 141 * 0 0 * * 0 0 * * CB:Z:TTTTTTTTT CR:Z:AAAAAAAAA +pass_missing_data_repaired_tag_concordant 77 * 0 0 * * 0 0 * * +pass_missing_data_repaired_tag_concordant 141 * 0 0 * * 0 0 * * CB:Z:- CR:Z:AAAAAAAAA +test_read_only_dropseq 77 * 0 0 * * 0 0 * * +test_read_only_dropseq 141 * 0 0 * * 0 0 * * CB:Z:- CR:Z:AAAAAAAAA diff --git a/testdata/org/broadinstitute/dropseq/utils/compare_tags_STARsolo.sam b/testdata/org/broadinstitute/dropseq/utils/compare_tags_STARsolo.sam new file mode 100644 index 00000000..ffc50656 --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/utils/compare_tags_STARsolo.sam @@ -0,0 +1,13 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:chr1 LN:248956422 +@CO Dataset 1: Data set mimics STARsolo outputs +@CO CellRanger does not encode the CB tag when repair fails, STARsolo encodes '-' as the missing value. 
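+@CO Read names correspond to the CellRanger data set without the /1 suffix; test_read_only_starsolo appears only in this data set.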
+pass_all_tags_agree 0 chr1 1000 255 50M * 0 0 * * CB:Z:TTTCGGGAC CR:Z:TTTCGGGAC +pass_all_tags_agree2 0 chr1 1000 255 50M * 0 0 * * CB:Z:TTTCGGGAC CR:Z:TTTCGGGAC +fail_same_unrepaired_diff_repaired 0 chr1 1010 255 50M * 0 0 * * CB:Z:CCCCGGGAC CR:Z:CCCCGGGTT +fail_diff_unrepaired_same_repaired 0 chr1 1020 255 50M * 0 0 * * CB:Z:GGGCGGGAA CR:Z:GGGCGGGAT +fail_diff_repaired_diff_unrepaired 0 chr1 1030 255 50M * 0 0 * * CB:Z:AAACGGGAA CR:Z:AAACGGGTT +fail_unset_repaired_tag 0 chr1 1030 255 50M * 0 0 * * CB:Z:TTTTTTTTT CR:Z:AAAAAAAAA +fail_missing_data_repaired_tag_discordant 0 chr1 1030 255 50M * 0 0 * * CB:Z:TTTTTTTTT CR:Z:AAAAAAAAA +pass_missing_data_repaired_tag_concordant 0 chr1 1030 255 50M * 0 0 * * CB:Z:- CR:Z:AAAAAAAAA +test_read_only_starsolo 0 chr1 1030 255 50M * 0 0 * * CB:Z:- CR:Z:AAAAAAAAA diff --git a/testdata/org/broadinstitute/dropseq/utils/testCompareBAMTagValuesUnpairedReads_report.txt b/testdata/org/broadinstitute/dropseq/utils/testCompareBAMTagValuesUnpairedReads_report.txt new file mode 100644 index 00000000..75a20ecb --- /dev/null +++ b/testdata/org/broadinstitute/dropseq/utils/testCompareBAMTagValuesUnpairedReads_report.txt @@ -0,0 +1,7 @@ +COUNT I1_CB I2_CB I1_CR I2_CR +1 NA NA AAAAAAAAA AAAAAAAAA +1 AAACGGGAA AAACGGGTT AAACGGGTT AAACGGGAA +1 CCCCGGGAC CCCCGGGAT CCCCGGGTT CCCCGGGTT +1 GGGCGGGAA GGGCGGGAA GGGCGGGAT GGGCGGGAC +2 TTTCGGGAC TTTCGGGAC TTTCGGGAC TTTCGGGAC +2 TTTTTTTTT NA AAAAAAAAA AAAAAAAAA