From 35889627e8644c42d9d50c146b0357b63e3f7de4 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 19 Oct 2020 12:54:18 -0400
Subject: [PATCH 01/26] [cudamapper] Add anchor chain output functions to
 chainer_utils.cuh.

Adds two functions required for retrieving the anchors within an overlap.
---
 cudamapper/CMakeLists.txt        |   1 +
 cudamapper/src/chainer_utils.cu  | 534 +++++++++++++++++++++++++++++++
 cudamapper/src/chainer_utils.cuh | 183 +++++++++++
 3 files changed, 718 insertions(+)
 create mode 100644 cudamapper/src/chainer_utils.cu
 create mode 100644 cudamapper/src/chainer_utils.cuh
diff --git a/cudamapper/CMakeLists.txt b/cudamapper/CMakeLists.txt
index 5b6d3be99..00b1c2054 100644
--- a/cudamapper/CMakeLists.txt
+++ b/cudamapper/CMakeLists.txt
@@ -46,6 +46,7 @@ endif()
 
 cuda_add_library(${MODULE_NAME}
         src/application_parameters.cpp
+        src/chainer_utils.cu
         src/cudamapper.cpp
         src/index_batcher.cu
         src/index_descriptor.cpp
diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
new file mode 100644
index 000000000..97dde816d
--- /dev/null
+++ b/cudamapper/src/chainer_utils.cu
@@ -0,0 +1,534 @@
+/*
+* Copyright 2020 NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "chainer_utils.cuh"
+
+#include <fstream>
+#include <sstream>
+#include <cstdlib>
+
+// Needed for accumulate - remove when ported to cuda
+#include <numeric>
+#include <limits>
+
+#include <cub/cub.cuh>
+#include <thrust/execution_policy.h>
+
+#include <claraparabricks/genomeworks/utils/cudautils.hpp>
+
+namespace claraparabricks
+{
+
+namespace genomeworks
+{
+
+namespace cudamapper
+{
+namespace chainerutils
+{
+
+__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b)
+{
+    return a.query_read_id_ == b.query_read_id_ && a.target_read_id_ == b.target_read_id_;
+}
+
+__device__ bool operator==(const QueryReadID& a, const QueryReadID& b)
+{
+    return a.query_read_id_ == b.query_read_id_;
+}
+
+__device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
+{
+    Overlap overlap;
+    overlap.num_residues_ = num_anchors;
+
+    overlap.query_read_id_  = start.query_read_id_;
+    overlap.target_read_id_ = start.target_read_id_;
+    assert(start.query_read_id_ == end.query_read_id_ && start.target_read_id_ == end.target_read_id_);
+
+    overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_);
+    overlap.query_end_position_in_read_   = max(start.query_position_in_read_, end.query_position_in_read_);
+    bool is_negative_strand               = end.target_position_in_read_ < start.target_position_in_read_;
+    if (is_negative_strand)
+    {
+        overlap.relative_strand                = RelativeStrand::Reverse;
+        overlap.target_start_position_in_read_ = end.target_position_in_read_;
+        overlap.target_end_position_in_read_   = start.target_position_in_read_;
+    }
+    else
+    {
+        overlap.relative_strand                = RelativeStrand::Forward;
+        overlap.target_start_position_in_read_ = start.target_position_in_read_;
+        overlap.target_end_position_in_read_   = end.target_position_in_read_;
+    }
+    return overlap;
+}
+
+void allocate_anchor_chains(device_buffer<Overlap> overlaps,
+                            device_buffer<int32_t>& unrolled_anchor_chains,
+                            const int32_t num_overlaps,
+                            int32_t& num_total_anchors,
+                            DefaultDeviceAllocator& _allocator,
+                            cudaStream_t& _cuda_stream)
+{
+    // sum the number of chains across all overlaps
+    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
+    void* d_temp_storage           = nullptr;
+    std::size_t temp_storage_bytes = 0;
+    OverlapToNumResiduesOp overlap_residue_count_op;
+    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
+
+    device_buffer<int32_t> d_num_total_anchors(1, _allocator, _cuda_stream);
+    device_buffer<int32_t> d_chain_starts(num_overlaps, _allocator, _cuda_stream);  // Index of the first anchor in the anchors array for an overlap.
+    device_buffer<int32_t> d_chain_lengths(num_overlaps, _allocator, _cuda_stream); // Equivalent to num_residues
+
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  d_chain_lengths.data(),
+                                  d_chain_starts.data(),
+                                  num_overlaps,
+                                  _cuda_stream);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  d_chain_lengths.data(),
+                                  d_chain_starts.data(),
+                                  num_overlaps,
+                                  _cuda_stream);
+
+    int32_t last_start  = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream);
+    int32_t last_length = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream);
+    num_total_anchors   = last_start + last_length;
+
+    unrolled_anchor_chains.clear_and_resize(num_total_anchors);
+}
+
+__global__ void output_overlap_chains(const Overlap* overlaps,
+                                      const Anchor* anchors,
+                                      const bool* select_mask,
+                                      const int32_t* predecessors,
+                                      int32_t* anchor_chains,
+                                      int32_t* anchor_chain_starts,
+                                      int32_t num_overlaps,
+                                      bool check_mask)
+{
+    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int32_t stride      = blockDim.x * gridDim.x;
+
+    // Processes one overlap per iteration,
+    // "i" corresponds to an overlap
+    for (int i = d_thread_id; i < num_overlaps; i += stride)
+    {
+        // index within this chain of anchors (i.e., the anchors within a single overlap)
+
+        if (!check_mask || (check_mask & select_mask[i]))
+        {
+            int32_t anchor_chain_index = 0;
+            // As chaining proceeds backwards (i.e., it's a backtrace),
+            // we need to fill the new anchor chain array in in reverse order.
+            int32_t index = anchor_chain_starts[i];
+            while (index != -1)
+            {
+                anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index;
+                int32_t pred                                                                             = predecessors[index];
+                index                                                                                    = pred;
+                ++anchor_chain_index;
+            }
+        }
+    }
+}
+
+__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
+                                              Overlap* overlaps,
+                                              double* scores,
+                                              bool* max_select_mask,
+                                              int32_t* predecessors,
+                                              const int32_t n_anchors,
+                                              const int32_t min_score)
+{
+    const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (d_tid < n_anchors)
+    {
+
+        int32_t global_overlap_index = d_tid;
+        if (scores[d_tid] >= min_score)
+        {
+
+            int32_t index                = global_overlap_index;
+            int32_t first_index          = index;
+            int32_t num_anchors_in_chain = 0;
+            Anchor final_anchor          = anchors[global_overlap_index];
+
+            while (index != -1)
+            {
+                first_index  = index;
+                int32_t pred = predecessors[index];
+                if (pred != -1)
+                {
+                    max_select_mask[pred] = false;
+                }
+                num_anchors_in_chain++;
+                index = predecessors[index];
+            }
+            Anchor first_anchor            = anchors[first_index];
+            overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+            // Overlap final_overlap          = overlaps[global_overlap_index];
+            // printf("%d %d %d %d %d %d %d %f\n",
+            //        final_overlap.query_read_id_, final_overlap.query_start_position_in_read_, final_overlap.query_end_position_in_read_,
+            //        final_overlap.target_read_id_, final_overlap.target_start_position_in_read_, final_overlap.target_end_position_in_read_,
+            //        final_overlap.num_residues_,
+            //        final_score);
+        }
+        else
+        {
+            max_select_mask[global_overlap_index] = false;
+        }
+    }
+}
+
+__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts)
+{
+    std::int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (d_tid < n_starts)
+    {
+        ends[d_tid] = starts[d_tid] + lengths[d_tid] - 1;
+    }
+}
+
+// we take each query that is encoded as length (ie there are 4 queries that have the same id)
+// so we take the (how many queries are in that section) and divide it by the size of the tile to get the
+// number of tiles for that query
+__global__ void calculate_tiles_per_read(const std::int32_t* lengths,
+                                         const int32_t num_reads,
+                                         const int32_t tile_size,
+                                         std::int32_t* tiles_per_read)
+{
+    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (d_thread_id < num_reads)
+    {
+        // if the number of queries in that section are not evenly divisible, we need an extra block
+        // to accomadate for the leftovers
+        int32_t n_integer_blocks    = lengths[d_thread_id] / tile_size;
+        int32_t remainder           = lengths[d_thread_id] % tile_size;
+        tiles_per_read[d_thread_id] = remainder == 0 ? n_integer_blocks : n_integer_blocks + 1;
+    }
+}
+
+// TODO VI: threads may be overwriting each other.
+// because the number of queries is greater than number of tiles, each thread here has to write the query starts
+// for multiple tiles
+__global__ void calculate_tile_starts(const std::int32_t* query_starts,
+                                      const std::int32_t* tiles_per_query,
+                                      std::int32_t* tile_starts,
+                                      const int32_t tile_size,
+                                      const int32_t num_queries,
+                                      const std::int32_t* tiles_per_query_up_to_point)
+{
+    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int32_t stride      = blockDim.x * gridDim.x;
+    if (d_thread_id < num_queries)
+    {
+        // for each tile, we look up the query it corresponds to and offset it by the which tile in the query
+        // we're at multiplied by the total size of the tile
+        // TODO VI: this memory access pattern seems a little off? Thread i and thread i+1 would overwrite eachother, no?
+        for (int i = 0; i < tiles_per_query[d_thread_id]; i++)
+        {
+            //finds the offset in the ragged array and offsets to find start of "next" sub array
+            tile_starts[tiles_per_query_up_to_point[d_thread_id] + i] = query_starts[d_thread_id] + (i * tile_size);
+        }
+    }
+    //int counter = 0;
+    //for (int i =0; i < num_queries; i++)
+    //{
+    //    for (int j = 0; j < tiles_per_query[i]; j++)
+    //    {
+    //        tile_starts[counter] = query_starts[i] + (j * tile_size);
+    //        counter++;
+    //    }
+    //}
+}
+
+void encode_anchor_query_locations(const Anchor* anchors,
+                                   int32_t n_anchors,
+                                   int32_t tile_size,
+                                   device_buffer<int32_t>& query_starts,
+                                   device_buffer<int32_t>& query_lengths,
+                                   device_buffer<int32_t>& query_ends,
+                                   device_buffer<int32_t>& tiles_per_query,
+                                   device_buffer<int32_t>& tile_starts,
+                                   int32_t& n_queries,
+                                   int32_t& n_query_tiles,
+                                   DefaultDeviceAllocator& _allocator,
+                                   cudaStream_t& _cuda_stream,
+                                   int32_t block_size)
+{
+    AnchorToQueryReadIDOp anchor_to_read_op;
+    // This takes anchors and outputs and converts the anchors to QueryReadID types (references)
+    cub::TransformInputIterator<QueryReadID, AnchorToQueryReadIDOp, const Anchor*> d_queries(anchors, anchor_to_read_op);
+    // create buffer of size number of anchors
+    device_buffer<QueryReadID> d_query_read_ids(n_anchors, _allocator, _cuda_stream);
+    // vector of number of read ids...? I don't know what this is for
+    // This is used to store the length of the encoded sequence
+    device_buffer<int32_t> d_num_query_read_ids(1, _allocator, _cuda_stream);
+
+    // don't know what this is for yet
+    // this is for internal use for run length encoding
+    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
+    void* d_temp_storage           = nullptr;
+    std::size_t temp_storage_bytes = 0;
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_queries,
+                                       d_query_read_ids.data(),
+                                       query_lengths.data(),
+                                       d_num_query_read_ids.data(),
+                                       n_anchors);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_queries,
+                                       d_query_read_ids.data(),
+                                       query_lengths.data(), // this is the vector of encoded lengths
+                                       d_num_query_read_ids.data(),
+                                       n_anchors);
+    // this is just the "length" of the encoded sequence
+    n_queries          = cudautils::get_value_from_device(d_num_query_read_ids.data(), _cuda_stream);
+    d_temp_storage     = nullptr;
+    temp_storage_bytes = 0;
+
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_lengths.data(), // this is the vector of encoded lengths
+                                  query_starts.data(),  // at this point, this vector is empty
+                                  n_queries,
+                                  _cuda_stream);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    // this gives us the number of queries up to that point. eg How many query starts we have at each index
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_lengths.data(), // this is the vector of encoded lengths
+                                  query_starts.data(),
+                                  n_queries,
+                                  _cuda_stream);
+
+    // paper uses the ends and finds the beginnings with x - w + 1, are we converting to that here?
+    // TODO VI: I'm not entirely sure what this is for? I think we want to change the read query
+    // (defined by [query_start, query_start + query_length] to [query_end - query_length + 1, query_end])
+    // The above () is NOT true
+    convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(),  // this gives how many starts at each index
+                                                                                           query_lengths.data(), // this is the vector of encoded lengths
+                                                                                           query_ends.data(),
+                                                                                           n_queries);
+
+    // what case is this? A: This always runs because TILE_SIZE is fixed at 1024
+    if (tile_size > 0)
+    {
+        calculate_tiles_per_read<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_lengths.data(), n_queries, tile_size, tiles_per_query.data());
+        // This stores the total number of tiles that we need, calculated with the below Sum()
+        device_buffer<int32_t> d_n_query_tiles(1, _allocator, _cuda_stream);
+
+        d_temp_storage     = nullptr;
+        temp_storage_bytes = 0;
+        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries);
+
+        d_temp_buf.clear_and_resize(temp_storage_bytes);
+        d_temp_storage = d_temp_buf.data();
+        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries);
+
+        // this is the length of tile_starts
+        n_query_tiles = cudautils::get_value_from_device(d_n_query_tiles.data(), _cuda_stream);
+
+        // This is used to calculate the offsets for tile_starts
+        device_buffer<int32_t> d_tiles_per_query_up_to_point(n_queries, _allocator, _cuda_stream);
+
+        d_temp_storage     = nullptr;
+        temp_storage_bytes = 0;
+        cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      tiles_per_query.data(), // this is the vector of encoded lengths
+                                      d_tiles_per_query_up_to_point.data(),
+                                      n_queries,
+                                      _cuda_stream);
+
+        d_temp_buf.clear_and_resize(temp_storage_bytes);
+        d_temp_storage = d_temp_buf.data();
+
+        cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                      temp_storage_bytes,
+                                      tiles_per_query.data(), // this is the vector of encoded lengths
+                                      d_tiles_per_query_up_to_point.data(),
+                                      n_queries,
+                                      _cuda_stream);
+
+        calculate_tile_starts<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), tiles_per_query.data(), tile_starts.data(), tile_size, n_queries, d_tiles_per_query_up_to_point.data());
+    }
+}
+
+void encode_anchor_query_target_pairs(const Anchor* anchors,
+                                      int32_t n_anchors,
+                                      int32_t tile_size,
+                                      device_buffer<int32_t>& query_target_starts,
+                                      device_buffer<int32_t>& query_target_lengths,
+                                      device_buffer<int32_t>& query_target_ends,
+                                      device_buffer<int32_t>& tiles_per_read,
+                                      int32_t& n_query_target_pairs,
+                                      int32_t& n_qt_tiles,
+                                      DefaultDeviceAllocator& _allocator,
+                                      cudaStream_t& _cuda_stream,
+                                      int32_t block_size)
+{
+    AnchorToQueryTargetPairOp qt_pair_op;
+    cub::TransformInputIterator<QueryTargetPair, AnchorToQueryTargetPairOp, const Anchor*> d_query_target_pairs(anchors, qt_pair_op);
+    device_buffer<QueryTargetPair> d_qt_pairs(n_anchors, _allocator, _cuda_stream);
+    device_buffer<int32_t> d_num_query_target_pairs(1, _allocator, _cuda_stream);
+
+    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
+    void* d_temp_storage           = nullptr;
+    std::size_t temp_storage_bytes = 0;
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_query_target_pairs,
+                                       d_qt_pairs.data(),
+                                       query_target_lengths.data(),
+                                       d_num_query_target_pairs.data(),
+                                       n_anchors);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_query_target_pairs,
+                                       d_qt_pairs.data(),
+                                       query_target_lengths.data(), // this is the vector of encoded lengths
+                                       d_num_query_target_pairs.data(),
+                                       n_anchors);
+
+    n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream);
+
+    d_temp_storage     = nullptr;
+    temp_storage_bytes = 0;
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_target_lengths.data(),
+                                  query_target_starts.data(),
+                                  n_query_target_pairs,
+                                  _cuda_stream);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_target_lengths.data(),
+                                  query_target_starts.data(),
+                                  n_query_target_pairs, _cuda_stream);
+
+    convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(),
+                                                                                                      query_target_lengths.data(),
+                                                                                                      query_target_ends.data(),
+                                                                                                      n_query_target_pairs);
+
+    if (tile_size > 0)
+    {
+        calculate_tiles_per_read<<<(n_query_target_pairs / block_size) + 1, 32, 0, _cuda_stream>>>(query_target_starts.data(), n_query_target_pairs, tile_size, tiles_per_read.data());
+        device_buffer<int32_t> d_n_qt_tiles(1, _allocator, _cuda_stream);
+
+        d_temp_storage     = nullptr;
+        temp_storage_bytes = 0;
+        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs);
+        d_temp_buf.clear_and_resize(temp_storage_bytes);
+        d_temp_storage = d_temp_buf.data();
+        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs);
+        n_qt_tiles = cudautils::get_value_from_device(d_n_qt_tiles.data(), _cuda_stream);
+    }
+}
+
+void encode_overlap_query_target_pairs(Overlap* overlaps,
+                                       int32_t n_overlaps,
+                                       device_buffer<int32_t>& query_target_starts,
+                                       device_buffer<int32_t>& query_target_lengths,
+                                       device_buffer<int32_t>& query_target_ends,
+                                       int32_t& n_query_target_pairs,
+                                       DefaultDeviceAllocator& _allocator,
+                                       cudaStream_t& _cuda_stream,
+                                       int32_t block_size)
+{
+    OverlapToQueryTargetPairOp qt_pair_op;
+    cub::TransformInputIterator<QueryTargetPair, OverlapToQueryTargetPairOp, Overlap*> d_query_target_pairs(overlaps, qt_pair_op);
+    device_buffer<QueryTargetPair> d_qt_pairs(n_overlaps, _allocator, _cuda_stream);
+    device_buffer<int32_t> d_num_query_target_pairs(1, _allocator, _cuda_stream);
+
+    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
+    void* d_temp_storage           = nullptr;
+    std::size_t temp_storage_bytes = 0;
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_query_target_pairs,
+                                       d_qt_pairs.data(),
+                                       query_target_lengths.data(),
+                                       d_num_query_target_pairs.data(),
+                                       n_overlaps);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
+                                       temp_storage_bytes,
+                                       d_query_target_pairs,
+                                       d_qt_pairs.data(),
+                                       query_target_lengths.data(),
+                                       d_num_query_target_pairs.data(),
+                                       n_overlaps);
+
+    n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream);
+
+    d_temp_storage     = nullptr;
+    temp_storage_bytes = 0;
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_target_lengths.data(),
+                                  query_target_starts.data(),
+                                  n_query_target_pairs, _cuda_stream);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+                                  temp_storage_bytes,
+                                  query_target_lengths.data(),
+                                  query_target_starts.data(),
+                                  n_query_target_pairs, _cuda_stream);
+
+    convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), query_target_lengths.data(), query_target_ends.data(), n_query_target_pairs);
+}
+
+} // namespace chainerutils
+} // namespace cudamapper
+} // namespace genomeworks
+} // namespace claraparabricks
\ No newline at end of file
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
new file mode 100644
index 000000000..ff077de9c
--- /dev/null
+++ b/cudamapper/src/chainer_utils.cuh
@@ -0,0 +1,183 @@
+/*
+* Copyright 2020 NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#pragma once
+
+#include <fstream>
+#include <sstream>
+#include <cstdlib>
+
+// Needed for accumulate - remove when ported to cuda
+#include <numeric>
+#include <limits>
+
+#include <claraparabricks/genomeworks/cudamapper/types.hpp>
+#include <claraparabricks/genomeworks/utils/device_buffer.hpp>
+
+namespace claraparabricks
+{
+
+namespace genomeworks
+{
+
+namespace cudamapper
+{
+namespace chainerutils
+{
+
+#define MAX_CHAINS_PER_TILE 5
+
+struct QueryTargetPair
+{
+    int32_t query_read_id_;
+    int32_t target_read_id_;
+    __device__ QueryTargetPair() {}
+};
+
+struct QueryReadID
+{
+    int32_t query_read_id_;
+    __device__ QueryReadID(){};
+};
+
+// takes the anchor and returns the query read id
+struct AnchorToQueryReadIDOp
+{
+    __device__ __forceinline__ QueryReadID operator()(const Anchor& a) const
+    {
+        QueryReadID query;
+        query.query_read_id_ = a.query_read_id_;
+        return query;
+    }
+};
+
+__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b);
+
+struct OverlapToQueryTargetPairOp
+{
+    __device__ __forceinline__ QueryTargetPair operator()(const Overlap& a) const
+    {
+        QueryTargetPair p;
+        p.query_read_id_  = a.query_read_id_;
+        p.target_read_id_ = a.target_read_id_;
+        return p;
+    }
+};
+
+struct AnchorToQueryTargetPairOp
+{
+    __device__ __forceinline__ QueryTargetPair operator()(const Anchor& a) const
+    {
+        QueryTargetPair p;
+        p.query_read_id_  = a.query_read_id_;
+        p.target_read_id_ = a.target_read_id_;
+        return p;
+    }
+};
+
+struct OverlapToNumResiduesOp
+{
+    __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const
+    {
+        return overlap.num_residues_;
+    }
+};
+
+__device__ bool
+operator==(const QueryTargetPair& a, const QueryTargetPair& b);
+
+__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
+                                              Overlap* overlaps,
+                                              double* scores,
+                                              bool* max_select_mask,
+                                              int32_t* predecessors,
+                                              const int32_t n_anchors,
+                                              const int32_t min_score);
+///
+///@brief Allocate a 1-dimensional array representing an unrolled 2D-array
+/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
+/// copy the anchors, the final array holds the indices of the global index array.
+///
+///@param overlaps An array of Overlaps. Must have a well-formed num_residues_ field.
+///@param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
+///@param num_overlaps The number of overlaps in the overlaps array.
+///@param _allocator  The DefaultDeviceAllocator
+///@param _cuda_stream The cudastream to allocate memory within.
+void allocate_anchor_chains(device_buffer<Overlap> overlaps,
+                            device_buffer<int32_t>& unrolled_anchor_chains,
+                            const int32_t num_overlaps,
+                            int32_t& num_total_anchors,
+                            DefaultDeviceAllocator& _allocator,
+                            cudaStream_t& _cuda_stream);
+
+__global__ void output_overlap_chains(const Overlap* overlaps,
+                                      const Anchor* anchors,
+                                      const bool* select_mask,
+                                      const int32_t* predecessors,
+                                      int32_t* anchor_chains,
+                                      int32_t* anchor_chain_starts,
+                                      int32_t num_overlaps,
+                                      bool check_mask);
+
+__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts);
+
+__global__ void calculate_tile_starts(const std::int32_t* query_starts,
+                                      const std::int32_t* tiles_per_query,
+                                      std::int32_t* tile_starts,
+                                      const int32_t tile_size,
+                                      int32_t num_queries,
+                                      const std::int32_t* tiles_per_query_up_to_point);
+
+void encode_anchor_query_locations(const Anchor* anchors,
+                                   int32_t n_anchors,
+                                   int32_t tile_size,
+                                   device_buffer<int32_t>& query_starts,
+                                   device_buffer<int32_t>& query_lengths,
+                                   device_buffer<int32_t>& query_ends,
+                                   device_buffer<int32_t>& tiles_per_query,
+                                   device_buffer<int32_t>& tile_starts,
+                                   int32_t& n_queries,
+                                   int32_t& n_query_tiles,
+                                   DefaultDeviceAllocator& _allocator,
+                                   cudaStream_t& _cuda_stream,
+                                   int32_t block_size);
+
+void encode_anchor_query_target_pairs(const Anchor* anchors,
+                                      int32_t n_anchors,
+                                      int32_t tile_size,
+                                      device_buffer<int32_t>& query_target_pair_starts,
+                                      device_buffer<int32_t>& query_target_pair_lengths,
+                                      device_buffer<int32_t>& query_target_pair_ends,
+                                      device_buffer<int32_t>& tiles_per_qt_pair,
+                                      int32_t& n_query_target_pairs,
+                                      int32_t& n_qt_tiles,
+                                      DefaultDeviceAllocator& _allocator,
+                                      cudaStream_t& _cuda_stream,
+                                      int32_t block_size);
+
+void encode_overlap_query_target_pairs(Overlap* overlaps,
+                                       int32_t n_overlaps,
+                                       device_buffer<int32_t>& query_target_pair_starts,
+                                       device_buffer<int32_t>& query_target_pair_lengths,
+                                       device_buffer<int32_t>& query_target_pair_ends,
+                                       int32_t& n_query_target_pairs,
+                                       DefaultDeviceAllocator& _allocator,
+                                       cudaStream_t& _cuda_stream,
+                                       int32_t block_size);
+} // namespace chainerutils
+} // namespace cudamapper
+} // namespace genomeworks
+} // namespace claraparabricks
\ No newline at end of file

From 19040dcb610f4241b49f0ea684765bc0a90cd792 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 19 Oct 2020 17:37:02 -0400
Subject: [PATCH 02/26] [cudamapper] Add anchor chain trace for RLE and add
 anchor chain start calculation to output_overlap_chains_by_backtrace.

---
 cudamapper/src/chainer_utils.cu  | 409 +++++--------------------------
 cudamapper/src/chainer_utils.cuh |  72 ++----
 2 files changed, 76 insertions(+), 405 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 97dde816d..3b00a4c1a 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -79,6 +79,7 @@ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end,
 
 void allocate_anchor_chains(device_buffer<Overlap> overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
+                            device_buffer<int32_t>& anchor_chain_starts,
                             const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
@@ -92,13 +93,36 @@ void allocate_anchor_chains(device_buffer<Overlap> overlaps,
     cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
 
     device_buffer<int32_t> d_num_total_anchors(1, _allocator, _cuda_stream);
-    device_buffer<int32_t> d_chain_starts(num_overlaps, _allocator, _cuda_stream);  // Index of the first anchor in the anchors array for an overlap.
-    device_buffer<int32_t> d_chain_lengths(num_overlaps, _allocator, _cuda_stream); // Equivalent to num_residues
+
+    cub::DeviceReduce::Sum(d_temp_storage,
+                           temp_storage_bytes,
+                           d_residue_counts,
+                           d_num_total_anchors.data(),
+                           num_overlaps,
+                           _cuda_stream);
+
+    d_temp_buf.clear_and_resize(temp_storage_bytes);
+    d_temp_storage = d_temp_buf.data();
+
+    cub::DeviceReduce::Sum(d_temp_storage,
+                           temp_storage_bytes,
+                           d_residue_counts,
+                           d_num_total_anchors.data(),
+                           num_overlaps,
+                           _cuda_stream);
+
+    d_temp_storage     = nullptr;
+    temp_storage_bytes = 0;
+
+    num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), _cuda_stream);
+
+    unrolled_anchor_chains.clear_and_resize(num_total_anchors);
+    anchor_chain_starts.clear_and_resize(num_overlaps);
 
     cub::DeviceScan::ExclusiveSum(d_temp_storage,
                                   temp_storage_bytes,
-                                  d_chain_lengths.data(),
-                                  d_chain_starts.data(),
+                                  d_residue_counts,
+                                  anchor_chain_starts.data(),
                                   num_overlaps,
                                   _cuda_stream);
 
@@ -107,26 +131,41 @@ void allocate_anchor_chains(device_buffer<Overlap> overlaps,
 
     cub::DeviceScan::ExclusiveSum(d_temp_storage,
                                   temp_storage_bytes,
-                                  d_chain_lengths.data(),
-                                  d_chain_starts.data(),
+                                  d_residue_counts,
+                                  anchor_chain_starts.data(),
                                   num_overlaps,
                                   _cuda_stream);
+}
 
-    int32_t last_start  = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream);
-    int32_t last_length = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream);
-    num_total_anchors   = last_start + last_length;
-
-    unrolled_anchor_chains.clear_and_resize(num_total_anchors);
+__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
+                                             const Anchor* anchors,
+                                             const int32_t* chain_starts,
+                                             const int32_t* chain_lengths,
+                                             int32_t* anchor_chains,
+                                             int32_t* anchor_chain_starts,
+                                             int32_t num_overlaps)
+{
+    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int32_t stride      = blockDim.x * gridDim.x;
+    for (int i = d_thread_id; i < num_overlaps; i += stride)
+    {
+        int32_t chain_start  = chain_starts[i];
+        int32_t chain_length = chain_lengths[i];
+        for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i)
+        {
+            anchor_chains[ind] = ind;
+        }
+    }
 }
 
-__global__ void output_overlap_chains(const Overlap* overlaps,
-                                      const Anchor* anchors,
-                                      const bool* select_mask,
-                                      const int32_t* predecessors,
-                                      int32_t* anchor_chains,
-                                      int32_t* anchor_chain_starts,
-                                      int32_t num_overlaps,
-                                      bool check_mask)
+__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
+                                                   const Anchor* anchors,
+                                                   const bool* select_mask,
+                                                   const int32_t* predecessors,
+                                                   int32_t* anchor_chains,
+                                                   int32_t* anchor_chain_starts,
+                                                   int32_t num_overlaps,
+                                                   bool check_mask)
 {
     int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int32_t stride      = blockDim.x * gridDim.x;
@@ -188,12 +227,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
             }
             Anchor first_anchor            = anchors[first_index];
             overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
-            // Overlap final_overlap          = overlaps[global_overlap_index];
-            // printf("%d %d %d %d %d %d %d %f\n",
-            //        final_overlap.query_read_id_, final_overlap.query_start_position_in_read_, final_overlap.query_end_position_in_read_,
-            //        final_overlap.target_read_id_, final_overlap.target_start_position_in_read_, final_overlap.target_end_position_in_read_,
-            //        final_overlap.num_residues_,
-            //        final_score);
         }
         else
         {
@@ -202,332 +235,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
     }
 }
 
-__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts)
-{
-    std::int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (d_tid < n_starts)
-    {
-        ends[d_tid] = starts[d_tid] + lengths[d_tid] - 1;
-    }
-}
-
-// we take each query that is encoded as length (ie there are 4 queries that have the same id)
-// so we take the (how many queries are in that section) and divide it by the size of the tile to get the
-// number of tiles for that query
-__global__ void calculate_tiles_per_read(const std::int32_t* lengths,
-                                         const int32_t num_reads,
-                                         const int32_t tile_size,
-                                         std::int32_t* tiles_per_read)
-{
-    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    if (d_thread_id < num_reads)
-    {
-        // if the number of queries in that section are not evenly divisible, we need an extra block
-        // to accomadate for the leftovers
-        int32_t n_integer_blocks    = lengths[d_thread_id] / tile_size;
-        int32_t remainder           = lengths[d_thread_id] % tile_size;
-        tiles_per_read[d_thread_id] = remainder == 0 ? n_integer_blocks : n_integer_blocks + 1;
-    }
-}
-
-// TODO VI: threads may be overwriting each other.
-// because the number of queries is greater than number of tiles, each thread here has to write the query starts
-// for multiple tiles
-__global__ void calculate_tile_starts(const std::int32_t* query_starts,
-                                      const std::int32_t* tiles_per_query,
-                                      std::int32_t* tile_starts,
-                                      const int32_t tile_size,
-                                      const int32_t num_queries,
-                                      const std::int32_t* tiles_per_query_up_to_point)
-{
-    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int32_t stride      = blockDim.x * gridDim.x;
-    if (d_thread_id < num_queries)
-    {
-        // for each tile, we look up the query it corresponds to and offset it by the which tile in the query
-        // we're at multiplied by the total size of the tile
-        // TODO VI: this memory access pattern seems a little off? Thread i and thread i+1 would overwrite eachother, no?
-        for (int i = 0; i < tiles_per_query[d_thread_id]; i++)
-        {
-            //finds the offset in the ragged array and offsets to find start of "next" sub array
-            tile_starts[tiles_per_query_up_to_point[d_thread_id] + i] = query_starts[d_thread_id] + (i * tile_size);
-        }
-    }
-    //int counter = 0;
-    //for (int i =0; i < num_queries; i++)
-    //{
-    //    for (int j = 0; j < tiles_per_query[i]; j++)
-    //    {
-    //        tile_starts[counter] = query_starts[i] + (j * tile_size);
-    //        counter++;
-    //    }
-    //}
-}
-
-void encode_anchor_query_locations(const Anchor* anchors,
-                                   int32_t n_anchors,
-                                   int32_t tile_size,
-                                   device_buffer<int32_t>& query_starts,
-                                   device_buffer<int32_t>& query_lengths,
-                                   device_buffer<int32_t>& query_ends,
-                                   device_buffer<int32_t>& tiles_per_query,
-                                   device_buffer<int32_t>& tile_starts,
-                                   int32_t& n_queries,
-                                   int32_t& n_query_tiles,
-                                   DefaultDeviceAllocator& _allocator,
-                                   cudaStream_t& _cuda_stream,
-                                   int32_t block_size)
-{
-    AnchorToQueryReadIDOp anchor_to_read_op;
-    // This takes anchors and outputs and converts the anchors to QueryReadID types (references)
-    cub::TransformInputIterator<QueryReadID, AnchorToQueryReadIDOp, const Anchor*> d_queries(anchors, anchor_to_read_op);
-    // create buffer of size number of anchors
-    device_buffer<QueryReadID> d_query_read_ids(n_anchors, _allocator, _cuda_stream);
-    // vector of number of read ids...? I don't know what this is for
-    // This is used to store the length of the encoded sequence
-    device_buffer<int32_t> d_num_query_read_ids(1, _allocator, _cuda_stream);
-
-    // don't know what this is for yet
-    // this is for internal use for run length encoding
-    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
-    void* d_temp_storage           = nullptr;
-    std::size_t temp_storage_bytes = 0;
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_queries,
-                                       d_query_read_ids.data(),
-                                       query_lengths.data(),
-                                       d_num_query_read_ids.data(),
-                                       n_anchors);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_queries,
-                                       d_query_read_ids.data(),
-                                       query_lengths.data(), // this is the vector of encoded lengths
-                                       d_num_query_read_ids.data(),
-                                       n_anchors);
-    // this is just the "length" of the encoded sequence
-    n_queries          = cudautils::get_value_from_device(d_num_query_read_ids.data(), _cuda_stream);
-    d_temp_storage     = nullptr;
-    temp_storage_bytes = 0;
-
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_lengths.data(), // this is the vector of encoded lengths
-                                  query_starts.data(),  // at this point, this vector is empty
-                                  n_queries,
-                                  _cuda_stream);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    // this gives us the number of queries up to that point. eg How many query starts we have at each index
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_lengths.data(), // this is the vector of encoded lengths
-                                  query_starts.data(),
-                                  n_queries,
-                                  _cuda_stream);
-
-    // paper uses the ends and finds the beginnings with x - w + 1, are we converting to that here?
-    // TODO VI: I'm not entirely sure what this is for? I think we want to change the read query
-    // (defined by [query_start, query_start + query_length] to [query_end - query_length + 1, query_end])
-    // The above () is NOT true
-    convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(),  // this gives how many starts at each index
-                                                                                           query_lengths.data(), // this is the vector of encoded lengths
-                                                                                           query_ends.data(),
-                                                                                           n_queries);
-
-    // what case is this? A: This always runs because TILE_SIZE is fixed at 1024
-    if (tile_size > 0)
-    {
-        calculate_tiles_per_read<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_lengths.data(), n_queries, tile_size, tiles_per_query.data());
-        // This stores the total number of tiles that we need, calculated with the below Sum()
-        device_buffer<int32_t> d_n_query_tiles(1, _allocator, _cuda_stream);
-
-        d_temp_storage     = nullptr;
-        temp_storage_bytes = 0;
-        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries);
-
-        d_temp_buf.clear_and_resize(temp_storage_bytes);
-        d_temp_storage = d_temp_buf.data();
-        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries);
-
-        // this is the length of tile_starts
-        n_query_tiles = cudautils::get_value_from_device(d_n_query_tiles.data(), _cuda_stream);
-
-        // This is used to calculate the offsets for tile_starts
-        device_buffer<int32_t> d_tiles_per_query_up_to_point(n_queries, _allocator, _cuda_stream);
-
-        d_temp_storage     = nullptr;
-        temp_storage_bytes = 0;
-        cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                      temp_storage_bytes,
-                                      tiles_per_query.data(), // this is the vector of encoded lengths
-                                      d_tiles_per_query_up_to_point.data(),
-                                      n_queries,
-                                      _cuda_stream);
-
-        d_temp_buf.clear_and_resize(temp_storage_bytes);
-        d_temp_storage = d_temp_buf.data();
-
-        cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                      temp_storage_bytes,
-                                      tiles_per_query.data(), // this is the vector of encoded lengths
-                                      d_tiles_per_query_up_to_point.data(),
-                                      n_queries,
-                                      _cuda_stream);
-
-        calculate_tile_starts<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), tiles_per_query.data(), tile_starts.data(), tile_size, n_queries, d_tiles_per_query_up_to_point.data());
-    }
-}
-
-void encode_anchor_query_target_pairs(const Anchor* anchors,
-                                      int32_t n_anchors,
-                                      int32_t tile_size,
-                                      device_buffer<int32_t>& query_target_starts,
-                                      device_buffer<int32_t>& query_target_lengths,
-                                      device_buffer<int32_t>& query_target_ends,
-                                      device_buffer<int32_t>& tiles_per_read,
-                                      int32_t& n_query_target_pairs,
-                                      int32_t& n_qt_tiles,
-                                      DefaultDeviceAllocator& _allocator,
-                                      cudaStream_t& _cuda_stream,
-                                      int32_t block_size)
-{
-    AnchorToQueryTargetPairOp qt_pair_op;
-    cub::TransformInputIterator<QueryTargetPair, AnchorToQueryTargetPairOp, const Anchor*> d_query_target_pairs(anchors, qt_pair_op);
-    device_buffer<QueryTargetPair> d_qt_pairs(n_anchors, _allocator, _cuda_stream);
-    device_buffer<int32_t> d_num_query_target_pairs(1, _allocator, _cuda_stream);
-
-    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
-    void* d_temp_storage           = nullptr;
-    std::size_t temp_storage_bytes = 0;
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_query_target_pairs,
-                                       d_qt_pairs.data(),
-                                       query_target_lengths.data(),
-                                       d_num_query_target_pairs.data(),
-                                       n_anchors);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_query_target_pairs,
-                                       d_qt_pairs.data(),
-                                       query_target_lengths.data(), // this is the vector of encoded lengths
-                                       d_num_query_target_pairs.data(),
-                                       n_anchors);
-
-    n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream);
-
-    d_temp_storage     = nullptr;
-    temp_storage_bytes = 0;
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_target_lengths.data(),
-                                  query_target_starts.data(),
-                                  n_query_target_pairs,
-                                  _cuda_stream);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_target_lengths.data(),
-                                  query_target_starts.data(),
-                                  n_query_target_pairs, _cuda_stream);
-
-    convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(),
-                                                                                                      query_target_lengths.data(),
-                                                                                                      query_target_ends.data(),
-                                                                                                      n_query_target_pairs);
-
-    if (tile_size > 0)
-    {
-        calculate_tiles_per_read<<<(n_query_target_pairs / block_size) + 1, 32, 0, _cuda_stream>>>(query_target_starts.data(), n_query_target_pairs, tile_size, tiles_per_read.data());
-        device_buffer<int32_t> d_n_qt_tiles(1, _allocator, _cuda_stream);
-
-        d_temp_storage     = nullptr;
-        temp_storage_bytes = 0;
-        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs);
-        d_temp_buf.clear_and_resize(temp_storage_bytes);
-        d_temp_storage = d_temp_buf.data();
-        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs);
-        n_qt_tiles = cudautils::get_value_from_device(d_n_qt_tiles.data(), _cuda_stream);
-    }
-}
-
-void encode_overlap_query_target_pairs(Overlap* overlaps,
-                                       int32_t n_overlaps,
-                                       device_buffer<int32_t>& query_target_starts,
-                                       device_buffer<int32_t>& query_target_lengths,
-                                       device_buffer<int32_t>& query_target_ends,
-                                       int32_t& n_query_target_pairs,
-                                       DefaultDeviceAllocator& _allocator,
-                                       cudaStream_t& _cuda_stream,
-                                       int32_t block_size)
-{
-    OverlapToQueryTargetPairOp qt_pair_op;
-    cub::TransformInputIterator<QueryTargetPair, OverlapToQueryTargetPairOp, Overlap*> d_query_target_pairs(overlaps, qt_pair_op);
-    device_buffer<QueryTargetPair> d_qt_pairs(n_overlaps, _allocator, _cuda_stream);
-    device_buffer<int32_t> d_num_query_target_pairs(1, _allocator, _cuda_stream);
-
-    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
-    void* d_temp_storage           = nullptr;
-    std::size_t temp_storage_bytes = 0;
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_query_target_pairs,
-                                       d_qt_pairs.data(),
-                                       query_target_lengths.data(),
-                                       d_num_query_target_pairs.data(),
-                                       n_overlaps);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    cub::DeviceRunLengthEncode::Encode(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_query_target_pairs,
-                                       d_qt_pairs.data(),
-                                       query_target_lengths.data(),
-                                       d_num_query_target_pairs.data(),
-                                       n_overlaps);
-
-    n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream);
-
-    d_temp_storage     = nullptr;
-    temp_storage_bytes = 0;
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_target_lengths.data(),
-                                  query_target_starts.data(),
-                                  n_query_target_pairs, _cuda_stream);
-
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
-
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
-                                  temp_storage_bytes,
-                                  query_target_lengths.data(),
-                                  query_target_starts.data(),
-                                  n_query_target_pairs, _cuda_stream);
-
-    convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), query_target_lengths.data(), query_target_ends.data(), n_query_target_pairs);
-}
-
 } // namespace chainerutils
 } // namespace cudamapper
 } // namespace genomeworks
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index ff077de9c..8f4b09180 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -118,65 +118,29 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
 ///@param _cuda_stream The cudastream to allocate memory within.
 void allocate_anchor_chains(device_buffer<Overlap> overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
+                            device_buffer<int32_t>& anchor_chain_starts,
                             const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream);
 
-__global__ void output_overlap_chains(const Overlap* overlaps,
-                                      const Anchor* anchors,
-                                      const bool* select_mask,
-                                      const int32_t* predecessors,
-                                      int32_t* anchor_chains,
-                                      int32_t* anchor_chain_starts,
-                                      int32_t num_overlaps,
-                                      bool check_mask);
-
-__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts);
-
-__global__ void calculate_tile_starts(const std::int32_t* query_starts,
-                                      const std::int32_t* tiles_per_query,
-                                      std::int32_t* tile_starts,
-                                      const int32_t tile_size,
-                                      int32_t num_queries,
-                                      const std::int32_t* tiles_per_query_up_to_point);
-
-void encode_anchor_query_locations(const Anchor* anchors,
-                                   int32_t n_anchors,
-                                   int32_t tile_size,
-                                   device_buffer<int32_t>& query_starts,
-                                   device_buffer<int32_t>& query_lengths,
-                                   device_buffer<int32_t>& query_ends,
-                                   device_buffer<int32_t>& tiles_per_query,
-                                   device_buffer<int32_t>& tile_starts,
-                                   int32_t& n_queries,
-                                   int32_t& n_query_tiles,
-                                   DefaultDeviceAllocator& _allocator,
-                                   cudaStream_t& _cuda_stream,
-                                   int32_t block_size);
-
-void encode_anchor_query_target_pairs(const Anchor* anchors,
-                                      int32_t n_anchors,
-                                      int32_t tile_size,
-                                      device_buffer<int32_t>& query_target_pair_starts,
-                                      device_buffer<int32_t>& query_target_pair_lengths,
-                                      device_buffer<int32_t>& query_target_pair_ends,
-                                      device_buffer<int32_t>& tiles_per_qt_pair,
-                                      int32_t& n_query_target_pairs,
-                                      int32_t& n_qt_tiles,
-                                      DefaultDeviceAllocator& _allocator,
-                                      cudaStream_t& _cuda_stream,
-                                      int32_t block_size);
-
-void encode_overlap_query_target_pairs(Overlap* overlaps,
-                                       int32_t n_overlaps,
-                                       device_buffer<int32_t>& query_target_pair_starts,
-                                       device_buffer<int32_t>& query_target_pair_lengths,
-                                       device_buffer<int32_t>& query_target_pair_ends,
-                                       int32_t& n_query_target_pairs,
-                                       DefaultDeviceAllocator& _allocator,
-                                       cudaStream_t& _cuda_stream,
-                                       int32_t block_size);
+__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
+                                                   const Anchor* anchors,
+                                                   const bool* select_mask,
+                                                   const int32_t* predecessors,
+                                                   int32_t* anchor_chains,
+                                                   int32_t* anchor_chain_starts,
+                                                   int32_t num_overlaps,
+                                                   bool check_mask);
+
+__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
+                                             const Anchor* anchors,
+                                             const int32_t* chain_starts,
+                                             const int32_t* chain_lengths,
+                                             int32_t* anchor_chains,
+                                             int32_t* anchor_chain_starts,
+                                             int32_t num_overlaps);
+
 } // namespace chainerutils
 } // namespace cudamapper
 } // namespace genomeworks

From 233cfc22e82ca3dea34d58c4fc83f404d230dcd9 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 19 Oct 2020 17:40:08 -0400
Subject: [PATCH 03/26] [cudamapper] Clean up unnecessary functions in
 chainer_utils.

---
 cudamapper/src/chainer_utils.cu  | 10 ------
 cudamapper/src/chainer_utils.cuh | 53 --------------------------------
 2 files changed, 63 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 3b00a4c1a..e2af3d443 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -40,16 +40,6 @@ namespace cudamapper
 namespace chainerutils
 {
 
-__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b)
-{
-    return a.query_read_id_ == b.query_read_id_ && a.target_read_id_ == b.target_read_id_;
-}
-
-__device__ bool operator==(const QueryReadID& a, const QueryReadID& b)
-{
-    return a.query_read_id_ == b.query_read_id_;
-}
-
 __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
 {
     Overlap overlap;
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 8f4b09180..f2862920c 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -38,56 +38,6 @@ namespace cudamapper
 namespace chainerutils
 {
 
-#define MAX_CHAINS_PER_TILE 5
-
-struct QueryTargetPair
-{
-    int32_t query_read_id_;
-    int32_t target_read_id_;
-    __device__ QueryTargetPair() {}
-};
-
-struct QueryReadID
-{
-    int32_t query_read_id_;
-    __device__ QueryReadID(){};
-};
-
-// takes the anchor and returns the query read id
-struct AnchorToQueryReadIDOp
-{
-    __device__ __forceinline__ QueryReadID operator()(const Anchor& a) const
-    {
-        QueryReadID query;
-        query.query_read_id_ = a.query_read_id_;
-        return query;
-    }
-};
-
-__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b);
-
-struct OverlapToQueryTargetPairOp
-{
-    __device__ __forceinline__ QueryTargetPair operator()(const Overlap& a) const
-    {
-        QueryTargetPair p;
-        p.query_read_id_  = a.query_read_id_;
-        p.target_read_id_ = a.target_read_id_;
-        return p;
-    }
-};
-
-struct AnchorToQueryTargetPairOp
-{
-    __device__ __forceinline__ QueryTargetPair operator()(const Anchor& a) const
-    {
-        QueryTargetPair p;
-        p.query_read_id_  = a.query_read_id_;
-        p.target_read_id_ = a.target_read_id_;
-        return p;
-    }
-};
-
 struct OverlapToNumResiduesOp
 {
     __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const
@@ -96,9 +46,6 @@ struct OverlapToNumResiduesOp
     }
 };
 
-__device__ bool
-operator==(const QueryTargetPair& a, const QueryTargetPair& b);
-
 __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               Overlap* overlaps,
                                               double* scores,

From 1cc1c25b82fb0f0c85badb66937ccb6d3acf57a4 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 19 Oct 2020 22:54:25 -0400
Subject: [PATCH 04/26] [cudamapper] Add tests for allocating anchor chains in
 chainer_utils. However, tests fail because the allocator used in testing
 can't actually allocate memory.

---
 cudamapper/src/chainer_utils.cu  | 6 +++---
 cudamapper/src/chainer_utils.cuh | 8 ++++++--
 cudamapper/tests/CMakeLists.txt  | 1 +
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index e2af3d443..16f0f6d72 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -40,7 +40,7 @@ namespace cudamapper
 namespace chainerutils
 {
 
-__device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
+__host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
 {
     Overlap overlap;
     overlap.num_residues_ = num_anchors;
@@ -67,10 +67,10 @@ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end,
     return overlap;
 }
 
-void allocate_anchor_chains(device_buffer<Overlap> overlaps,
+void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            const int32_t num_overlaps,
+                            int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream)
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index f2862920c..5b5ca7366 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -46,6 +46,10 @@ struct OverlapToNumResiduesOp
     }
 };
 
+__host__ __device__ Overlap create_simple_overlap(const Anchor& start,
+                                                  const Anchor& end,
+                                                  const int32_t num_anchors);
+
 __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               Overlap* overlaps,
                                               double* scores,
@@ -63,10 +67,10 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
 ///@param num_overlaps The number of overlaps in the overlaps array.
 ///@param _allocator  The DefaultDeviceAllocator
 ///@param _cuda_stream The cudastream to allocate memory within.
-void allocate_anchor_chains(device_buffer<Overlap> overlaps,
+void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            const int32_t num_overlaps,
+                            int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream);
diff --git a/cudamapper/tests/CMakeLists.txt b/cudamapper/tests/CMakeLists.txt
index da8617978..acfb03a00 100644
--- a/cudamapper/tests/CMakeLists.txt
+++ b/cudamapper/tests/CMakeLists.txt
@@ -29,6 +29,7 @@ set(SOURCES
     Test_CudamapperOverlapper.cpp
     Test_CudamapperOverlapperTriggered.cu
     Test_CudamapperUtilsKmerFunctions.cpp
+    Test_CudamapperChainerUtils.cu
    )
 
 get_property(cudamapper_data_include_dir GLOBAL PROPERTY cudamapper_data_include_dir)

From 6680ba6f44a80990db50816a78451d56c886a77a Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Tue, 20 Oct 2020 11:01:20 -0400
Subject: [PATCH 05/26] Add tests for chainerutils.

---
 .../tests/Test_CudamapperChainerUtils.cu      | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 cudamapper/tests/Test_CudamapperChainerUtils.cu

diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
new file mode 100644
index 000000000..6348c1602
--- /dev/null
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -0,0 +1,139 @@
+/*
+* Copyright 2019-2020 NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include "gtest/gtest.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <vector>
+
+#include <claraparabricks/genomeworks/cudamapper/types.hpp>
+#include "../src/chainer_utils.cuh"
+
+namespace claraparabricks
+{
+
+namespace genomeworks
+{
+
+namespace cudamapper
+{
+
+TEST(TestChainerUtils, Create_Simple_Overlap_Tests)
+{
+    Anchor a;
+    a.query_read_id_           = 12;
+    a.query_position_in_read_  = 130;
+    a.target_read_id_          = 46;
+    a.target_position_in_read_ = 10320;
+
+    Anchor b;
+    b.query_read_id_           = 12;
+    b.query_position_in_read_  = 10000;
+    b.target_read_id_          = 46;
+    b.target_position_in_read_ = 20400;
+
+    Overlap p_ab = chainerutils::create_simple_overlap(a, b, 12);
+    ASSERT_EQ(p_ab.query_read_id_, static_cast<uint32_t>(12));
+    ASSERT_EQ(p_ab.target_read_id_, 46);
+    ASSERT_EQ(p_ab.query_start_position_in_read_, 130);
+    ASSERT_EQ(p_ab.query_end_position_in_read_, 10000);
+    ASSERT_EQ(p_ab.target_start_position_in_read_, 10320);
+    ASSERT_EQ(p_ab.num_residues_, 12);
+    ASSERT_EQ(static_cast<char>(p_ab.relative_strand), static_cast<char>(RelativeStrand::Forward));
+
+    Anchor c;
+    c.query_read_id_           = 12;
+    c.query_position_in_read_  = 15000;
+    c.target_read_id_          = 46;
+    c.target_position_in_read_ = 16000;
+
+    Overlap p_bc = chainerutils::create_simple_overlap(b, c, 22);
+    ASSERT_EQ(p_bc.target_start_position_in_read_, 16000);
+    ASSERT_EQ(p_bc.target_end_position_in_read_, 20400);
+    ASSERT_EQ(static_cast<char>(p_bc.relative_strand), static_cast<char>(RelativeStrand::Reverse));
+}
+
+TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
+{
+
+    DefaultDeviceAllocator allocator = create_default_device_allocator(2048);
+
+    CudaStream cuda_stream = make_cuda_stream();
+    auto cu_ptr            = cuda_stream.get();
+
+    Anchor a;
+    a.query_read_id_           = 12;
+    a.query_position_in_read_  = 130;
+    a.target_read_id_          = 46;
+    a.target_position_in_read_ = 10320;
+
+    Anchor b;
+    b.query_read_id_           = 12;
+    b.query_position_in_read_  = 10000;
+    b.target_read_id_          = 46;
+    b.target_position_in_read_ = 20400;
+
+    Overlap p_ab = chainerutils::create_simple_overlap(a, b, 2);
+
+    Anchor c;
+    c.query_read_id_           = 12;
+    c.query_position_in_read_  = 15000;
+    c.target_read_id_          = 46;
+    c.target_position_in_read_ = 16000;
+
+    Overlap p_bc = chainerutils::create_simple_overlap(b, c, 2);
+
+    std::vector<Anchor> anchors;
+    anchors.push_back(a);
+    anchors.push_back(b);
+    anchors.push_back(c);
+
+    std::vector<Overlap> overlaps;
+    overlaps.push_back(p_ab);
+    overlaps.push_back(p_bc);
+
+    device_buffer<Anchor> d_anchors(anchors.size(), allocator, cuda_stream.get());
+    device_buffer<Overlap> d_overlaps(overlaps.size(), allocator, cuda_stream.get());
+    cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data());
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data());
+
+    // device_buffer<int32_t> unrolled_anchor_chains(1, allocator, cuda_stream.get());
+    // device_buffer<int32_t> chain_starts(1, allocator, cuda_stream.get());
+
+    device_buffer<int32_t> unrolled_anchor_chains;
+    device_buffer<int32_t> chain_starts;
+
+    int32_t num_total_anchors;
+    chainerutils::allocate_anchor_chains(d_overlaps,
+                                         unrolled_anchor_chains,
+                                         chain_starts,
+                                         overlaps.size(),
+                                         num_total_anchors,
+                                         allocator,
+                                         cu_ptr);
+    // void allocate_anchor_chains(device_buffer<Overlap> overlaps,
+    // device_buffer<int32_t>& unrolled_anchor_chains,
+    // device_buffer<int32_t>& anchor_chain_starts,
+    // const int32_t num_overlaps,
+    // int32_t& num_total_anchors,
+    // DefaultDeviceAllocator& _allocator,
+    // cudaStream_t& _cuda_stream);
+}
+
+} // namespace cudamapper
+} // namespace genomeworks
+} // namespace claraparabricks
\ No newline at end of file

From 5860fb1eca04f35de5fb1fd66fc38424cdf38e96 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Thu, 22 Oct 2020 09:00:39 -0400
Subject: [PATCH 06/26] [cudamapper] Remove unnecessary includes.

---
 cudamapper/src/chainer_utils.cu  | 10 ----------
 cudamapper/src/chainer_utils.cuh |  8 --------
 2 files changed, 18 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 16f0f6d72..aabe59e6f 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -16,17 +16,7 @@
 
 #include "chainer_utils.cuh"
 
-#include <fstream>
-#include <sstream>
-#include <cstdlib>
-
-// Needed for accumulate - remove when ported to cuda
-#include <numeric>
-#include <limits>
-
 #include <cub/cub.cuh>
-#include <thrust/execution_policy.h>
-
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
 namespace claraparabricks
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 5b5ca7366..b13b0c664 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -16,14 +16,6 @@
 
 #pragma once
 
-#include <fstream>
-#include <sstream>
-#include <cstdlib>
-
-// Needed for accumulate - remove when ported to cuda
-#include <numeric>
-#include <limits>
-
 #include <claraparabricks/genomeworks/cudamapper/types.hpp>
 #include <claraparabricks/genomeworks/utils/device_buffer.hpp>
 

From ef4ce77f532580589ed71608e8207ba99c882e42 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Thu, 22 Oct 2020 09:49:21 -0400
Subject: [PATCH 07/26] [chainer_utils] Update docs.

---
 cudamapper/src/chainer_utils.cu  |  2 +-
 cudamapper/src/chainer_utils.cuh | 82 +++++++++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 7 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index aabe59e6f..d8d6438c1 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -41,7 +41,7 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc
 
     overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_);
     overlap.query_end_position_in_read_   = max(start.query_position_in_read_, end.query_position_in_read_);
-    bool is_negative_strand               = end.target_position_in_read_ < start.target_position_in_read_;
+    const bool is_negative_strand         = end.target_position_in_read_ < start.target_position_in_read_;
     if (is_negative_strand)
     {
         overlap.relative_strand                = RelativeStrand::Reverse;
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index b13b0c664..1ac07973e 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -38,10 +38,33 @@ struct OverlapToNumResiduesOp
     }
 };
 
+///
+/// \brief Create an overlap from the first and last anchor in the chain and
+/// the total number of anchors in the chain.
+///
+/// \param start The first anchor in the chain.
+/// \param end The last anchor in the chain.
+/// \param num_anchors The total number of anchors in the chain.
 __host__ __device__ Overlap create_simple_overlap(const Anchor& start,
                                                   const Anchor& end,
                                                   const int32_t num_anchors);
 
+///
+/// \brief Perform a backtrace on the predecessors array
+/// to generate overlaps from anchors. Anchors must have been chained
+/// by a chaining function that filles the predecessors and scores array.
+///
+/// \param anchors An array of anchors.
+/// \param overlaps An array of overlaps to be filled.
+/// \param scores An array of scores. Only chains with a score greater than
+/// min_score will be backtraced.
+/// \param max_select_mask A boolean mask, used to mask out any subchains during
+/// the backtrace.
+/// \param predecessors An array of indices into the anchors array marking the predecessor
+/// of each anchor within a chain.
+/// \param n_anchors The number of anchors.
+/// \param min_score The minimum score of a chain for performing backtracing.
+/// \return __global__
 __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               Overlap* overlaps,
                                               double* scores,
@@ -50,15 +73,31 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               const int32_t n_anchors,
                                               const int32_t min_score);
 ///
-///@brief Allocate a 1-dimensional array representing an unrolled 2D-array
+/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
 /// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
-/// copy the anchors, the final array holds the indices of the global index array.
+/// copy the anchors, the final array holds the indices within the anchors array
+/// of the anchors in the chain.
 ///
-///@param overlaps An array of Overlaps. Must have a well-formed num_residues_ field.
-///@param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
-///@param num_overlaps The number of overlaps in the overlaps array.
+/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field.
+/// \param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
+/// param num_overlaps The number of overlaps in the overlaps array.
 ///@param _allocator  The DefaultDeviceAllocator
 ///@param _cuda_stream The cudastream to allocate memory within.
+
+///
+/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
+/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
+/// copy the anchors, the final array holds the indices within the anchors array
+/// of the anchors in the chain.
+///
+/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
+/// \param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
+/// \param anchor_chain_starts An array holding the index in the anchors array of the first
+/// anchor in an overlap.
+/// \param num_overlaps The number of overlaps in the overlaps array.
+/// \param num_total_anchors The number of anchors in the anchors array.
+/// \param _allocator The DefaultDeviceAllocator for this overlapper.
+/// \param _cuda_stream The cudastream to allocate memory within.
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
@@ -67,6 +106,22 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream);
 
+///
+/// \brief Calculate the anchors chains used to produce each overlap in the
+/// overlap array for anchors chained by backtrace_anchors_to_overlaps.
+///
+/// \param overlaps An array of overlaps.
+/// \param anchors The array of anchors used to generate overlaps.
+/// \param select_mask An array of bools, used to mask overlaps from output.
+/// \param predecessors The predecessors array from anchor chaining.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold
+/// the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for
+/// each overlap in the overlaps array.
+/// \param num_overlaps The number of overlaps in the overlaps array
+/// \param check_mask A boolean. If true, only overlaps where select_mask is true
+/// will have their anchor chains calculated.
+/// \return __global__
 __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
                                                    const Anchor* anchors,
                                                    const bool* select_mask,
@@ -75,7 +130,22 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
                                                    int32_t* anchor_chain_starts,
                                                    int32_t num_overlaps,
                                                    bool check_mask);
-
+///
+/// \brief Calculate the anchors chains used to produce each overlap in the
+/// overlap array for anchors chained by RLE.
+///
+/// \param overlaps An array of overlaps.
+/// \param anchors The array of anchors used to generate overlaps.
+/// \param chain_starts An array which holds the indices of the first anchor for
+/// each overlap in the overlaps array.
+/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to
+/// the chain_starts array.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold
+/// the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for
+/// each overlap in the overlaps array.
+/// \param num_overlaps The number of overlaps in the overlaps array.
+/// \return __global__
 __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
                                              const Anchor* anchors,
                                              const int32_t* chain_starts,

From 2a864de10d3f982d669557735b27a8f19c534815 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Thu, 22 Oct 2020 09:50:46 -0400
Subject: [PATCH 08/26] [chainer_utils] Fix minor doc typos.

---
 cudamapper/src/chainer_utils.cuh | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 1ac07973e..80d65b7a4 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -72,17 +72,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               int32_t* predecessors,
                                               const int32_t n_anchors,
                                               const int32_t min_score);
-///
-/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
-/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
-/// copy the anchors, the final array holds the indices within the anchors array
-/// of the anchors in the chain.
-///
-/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field.
-/// \param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
-/// param num_overlaps The number of overlaps in the overlaps array.
-///@param _allocator  The DefaultDeviceAllocator
-///@param _cuda_stream The cudastream to allocate memory within.
 
 ///
 /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array

From 39f7232b73e30bc67a9cbd2a339b1964a2f6966c Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 26 Oct 2020 20:57:57 -0400
Subject: [PATCH 09/26] [docs] Fix typos and doxygen docs in chainer_utils.

---
 cudamapper/src/chainer_utils.cu               |  8 +++
 cudamapper/src/chainer_utils.cuh              | 50 +++++--------------
 .../tests/Test_CudamapperChainerUtils.cu      | 10 ----
 3 files changed, 20 insertions(+), 48 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index d8d6438c1..e5b68258e 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -30,6 +30,14 @@ namespace cudamapper
 namespace chainerutils
 {
 
+struct OverlapToNumResiduesOp
+{
+    __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const
+    {
+        return overlap.num_residues_;
+    }
+};
+
 __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
 {
     Overlap overlap;
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 80d65b7a4..df3ae09b1 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -30,15 +30,6 @@ namespace cudamapper
 namespace chainerutils
 {
 
-struct OverlapToNumResiduesOp
-{
-    __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const
-    {
-        return overlap.num_residues_;
-    }
-};
-
-///
 /// \brief Create an overlap from the first and last anchor in the chain and
 /// the total number of anchors in the chain.
 ///
@@ -49,22 +40,17 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start,
                                                   const Anchor& end,
                                                   const int32_t num_anchors);
 
-///
 /// \brief Perform a backtrace on the predecessors array
 /// to generate overlaps from anchors. Anchors must have been chained
 /// by a chaining function that filles the predecessors and scores array.
 ///
 /// \param anchors An array of anchors.
 /// \param overlaps An array of overlaps to be filled.
-/// \param scores An array of scores. Only chains with a score greater than
-/// min_score will be backtraced.
-/// \param max_select_mask A boolean mask, used to mask out any subchains during
-/// the backtrace.
-/// \param predecessors An array of indices into the anchors array marking the predecessor
-/// of each anchor within a chain.
+/// \param scores An array of scores. Only chains with a score greater than min_score will be backtraced.
+/// \param max_select_mask A boolean mask, used to mask out any subchains during the backtrace.
+/// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain.
 /// \param n_anchors The number of anchors.
 /// \param min_score The minimum score of a chain for performing backtracing.
-/// \return __global__
 __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               Overlap* overlaps,
                                               double* scores,
@@ -73,7 +59,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
                                               const int32_t n_anchors,
                                               const int32_t min_score);
 
-///
 /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
 /// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
 /// copy the anchors, the final array holds the indices within the anchors array
@@ -81,8 +66,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
 ///
 /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
 /// \param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
-/// \param anchor_chain_starts An array holding the index in the anchors array of the first
-/// anchor in an overlap.
+/// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap.
 /// \param num_overlaps The number of overlaps in the overlaps array.
 /// \param num_total_anchors The number of anchors in the anchors array.
 /// \param _allocator The DefaultDeviceAllocator for this overlapper.
@@ -95,7 +79,6 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream);
 
-///
 /// \brief Calculate the anchors chains used to produce each overlap in the
 /// overlap array for anchors chained by backtrace_anchors_to_overlaps.
 ///
@@ -103,14 +86,10 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
 /// \param anchors The array of anchors used to generate overlaps.
 /// \param select_mask An array of bools, used to mask overlaps from output.
 /// \param predecessors The predecessors array from anchor chaining.
-/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold
-/// the indices of anchors within each chain.
-/// \param anchor_chain_starts An array which holds the indices of the first anchor for
-/// each overlap in the overlaps array.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array
-/// \param check_mask A boolean. If true, only overlaps where select_mask is true
-/// will have their anchor chains calculated.
-/// \return __global__
+/// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated.
 __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
                                                    const Anchor* anchors,
                                                    const bool* select_mask,
@@ -119,22 +98,17 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
                                                    int32_t* anchor_chain_starts,
                                                    int32_t num_overlaps,
                                                    bool check_mask);
-///
+
 /// \brief Calculate the anchors chains used to produce each overlap in the
 /// overlap array for anchors chained by RLE.
 ///
 /// \param overlaps An array of overlaps.
 /// \param anchors The array of anchors used to generate overlaps.
-/// \param chain_starts An array which holds the indices of the first anchor for
-/// each overlap in the overlaps array.
-/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to
-/// the chain_starts array.
-/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold
-/// the indices of anchors within each chain.
-/// \param anchor_chain_starts An array which holds the indices of the first anchor for
-/// each overlap in the overlaps array.
+/// \param chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
+/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to the chain_starts array.
+/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
+/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array.
-/// \return __global__
 __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
                                              const Anchor* anchors,
                                              const int32_t* chain_starts,
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 6348c1602..666f3f459 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -111,9 +111,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data());
     cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data());
 
-    // device_buffer<int32_t> unrolled_anchor_chains(1, allocator, cuda_stream.get());
-    // device_buffer<int32_t> chain_starts(1, allocator, cuda_stream.get());
-
     device_buffer<int32_t> unrolled_anchor_chains;
     device_buffer<int32_t> chain_starts;
 
@@ -125,13 +122,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
                                          num_total_anchors,
                                          allocator,
                                          cu_ptr);
-    // void allocate_anchor_chains(device_buffer<Overlap> overlaps,
-    // device_buffer<int32_t>& unrolled_anchor_chains,
-    // device_buffer<int32_t>& anchor_chain_starts,
-    // const int32_t num_overlaps,
-    // int32_t& num_total_anchors,
-    // DefaultDeviceAllocator& _allocator,
-    // cudaStream_t& _cuda_stream);
 }
 
 } // namespace cudamapper

From 8d7d1efc80b35a6fc9b0fb84c34474ddd1229c94 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 26 Oct 2020 21:12:25 -0400
Subject: [PATCH 10/26] Fix compile errors from missing imports

---
 cudaaligner/samples/sample_cudaaligner.cpp   | 1 +
 cudaextender/samples/sample_cudaextender.cpp | 1 +
 cudamapper/samples/sample_cudamapper.cpp     | 1 +
 cudamapper/src/chainer_utils.cu              | 2 +-
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cudaaligner/samples/sample_cudaaligner.cpp b/cudaaligner/samples/sample_cudaaligner.cpp
index db147faab..f236ae2b3 100644
--- a/cudaaligner/samples/sample_cudaaligner.cpp
+++ b/cudaaligner/samples/sample_cudaaligner.cpp
@@ -21,6 +21,7 @@
 #include <claraparabricks/genomeworks/utils/genomeutils.hpp>
 
 #include <cuda_runtime_api.h>
+#include <getopt.h>
 #include <vector>
 #include <string>
 #include <stdexcept>
diff --git a/cudaextender/samples/sample_cudaextender.cpp b/cudaextender/samples/sample_cudaextender.cpp
index 87ad85793..53d4bd8e4 100644
--- a/cudaextender/samples/sample_cudaextender.cpp
+++ b/cudaextender/samples/sample_cudaextender.cpp
@@ -26,6 +26,7 @@
 #include <string>
 #include <vector>
 #include <cuda.h>
+#include <getopt.h>
 
 using namespace claraparabricks::genomeworks;
 using namespace cudautils;
diff --git a/cudamapper/samples/sample_cudamapper.cpp b/cudamapper/samples/sample_cudamapper.cpp
index cfe2d59f9..65151d16d 100644
--- a/cudamapper/samples/sample_cudamapper.cpp
+++ b/cudamapper/samples/sample_cudamapper.cpp
@@ -29,6 +29,7 @@
 #include <iostream>
 #include <string>
 #include <mutex>
+#include <unistd.h>
 
 // define constants. See cudamapper/src/application_parameters.hpp for more.
 // constants used in multiple places
diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index e5b68258e..7066b3aa1 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -127,7 +127,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
 
 __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
                                              const Anchor* anchors,
-                                             const int32_t* chain_starts,
+                                             const int32_t* const chain_starts,
                                              const int32_t* chain_lengths,
                                              int32_t* anchor_chains,
                                              int32_t* anchor_chain_starts,

From 756739f749ad8e93788d50377bd09a88d7494826 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 26 Oct 2020 21:28:19 -0400
Subject: [PATCH 11/26] [chainer_utils] Make arguments const / const * const
 where possible.

---
 cudamapper/src/chainer_utils.cu  | 46 ++++++++++++++++----------------
 cudamapper/src/chainer_utils.cuh | 42 ++++++++++++++---------------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 7066b3aa1..3280f6501 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -68,7 +68,7 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            int32_t num_overlaps,
+                            const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream)
@@ -125,16 +125,16 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                                   _cuda_stream);
 }
 
-__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
-                                             const Anchor* anchors,
+__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
+                                             const Anchor* const anchors,
                                              const int32_t* const chain_starts,
-                                             const int32_t* chain_lengths,
-                                             int32_t* anchor_chains,
-                                             int32_t* anchor_chain_starts,
-                                             int32_t num_overlaps)
+                                             const int32_t* const chain_lengths,
+                                             int32_t* const anchor_chains,
+                                             int32_t* const anchor_chain_starts,
+                                             const int32_t num_overlaps)
 {
-    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int32_t stride      = blockDim.x * gridDim.x;
+    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride      = blockDim.x * gridDim.x;
     for (int i = d_thread_id; i < num_overlaps; i += stride)
     {
         int32_t chain_start  = chain_starts[i];
@@ -146,17 +146,17 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
     }
 }
 
-__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
-                                                   const Anchor* anchors,
-                                                   const bool* select_mask,
-                                                   const int32_t* predecessors,
-                                                   int32_t* anchor_chains,
-                                                   int32_t* anchor_chain_starts,
-                                                   int32_t num_overlaps,
-                                                   bool check_mask)
+__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
+                                                   const Anchor* const anchors,
+                                                   const bool* const select_mask,
+                                                   const int32_t* const predecessors,
+                                                   int32_t* const anchor_chains,
+                                                   int32_t* const anchor_chain_starts,
+                                                   const int32_t num_overlaps,
+                                                   const bool check_mask)
 {
-    int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int32_t stride      = blockDim.x * gridDim.x;
+    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride      = blockDim.x * gridDim.x;
 
     // Processes one overlap per iteration,
     // "i" corresponds to an overlap
@@ -181,11 +181,11 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
     }
 }
 
-__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
+__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               Overlap* overlaps,
-                                              double* scores,
-                                              bool* max_select_mask,
-                                              int32_t* predecessors,
+                                              double* const scores,
+                                              bool* const max_select_mask,
+                                              int32_t* const predecessors,
                                               const int32_t n_anchors,
                                               const int32_t min_score)
 {
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index df3ae09b1..12f75b6de 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -51,11 +51,11 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start,
 /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain.
 /// \param n_anchors The number of anchors.
 /// \param min_score The minimum score of a chain for performing backtracing.
-__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
-                                              Overlap* overlaps,
-                                              double* scores,
-                                              bool* max_select_mask,
-                                              int32_t* predecessors,
+__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
+                                              Overlap* const overlaps,
+                                              double* const scores,
+                                              bool* const max_select_mask,
+                                              int32_t* const predecessors,
                                               const int32_t n_anchors,
                                               const int32_t min_score);
 
@@ -74,7 +74,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors,
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            int32_t num_overlaps,
+                            const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator& _allocator,
                             cudaStream_t& _cuda_stream);
@@ -90,14 +90,14 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
 /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array
 /// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated.
-__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
-                                                   const Anchor* anchors,
-                                                   const bool* select_mask,
-                                                   const int32_t* predecessors,
-                                                   int32_t* anchor_chains,
-                                                   int32_t* anchor_chain_starts,
-                                                   int32_t num_overlaps,
-                                                   bool check_mask);
+__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
+                                                   const Anchor* const anchors,
+                                                   const bool* const select_mask,
+                                                   const int32_t* const predecessors,
+                                                   int32_t* const anchor_chains,
+                                                   int32_t* const anchor_chain_starts,
+                                                   const int32_t num_overlaps,
+                                                   const bool check_mask);
 
 /// \brief Calculate the anchors chains used to produce each overlap in the
 /// overlap array for anchors chained by RLE.
@@ -109,13 +109,13 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps,
 /// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
 /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array.
-__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps,
-                                             const Anchor* anchors,
-                                             const int32_t* chain_starts,
-                                             const int32_t* chain_lengths,
-                                             int32_t* anchor_chains,
-                                             int32_t* anchor_chain_starts,
-                                             int32_t num_overlaps);
+__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
+                                             const Anchor* const anchors,
+                                             const int32_t* const chain_starts,
+                                             const int32_t* const chain_lengths,
+                                             int32_t* const anchor_chains,
+                                             int32_t* const anchor_chain_starts,
+                                             const int32_t num_overlaps);
 
 } // namespace chainerutils
 } // namespace cudamapper

From 7680371b0f4708d0b43ab05f2e028ed006f14083 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 26 Oct 2020 21:52:49 -0400
Subject: [PATCH 12/26] [chainer_utils] Pass cudastream and allocation by value
 (not reference) and fix docs issues brought up in review.

---
 cudamapper/src/chainer_utils.cu  | 18 +++++++++---------
 cudamapper/src/chainer_utils.cuh | 17 +++++++++--------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 3280f6501..208538f0f 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -70,24 +70,24 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& anchor_chain_starts,
                             const int32_t num_overlaps,
                             int32_t& num_total_anchors,
-                            DefaultDeviceAllocator& _allocator,
-                            cudaStream_t& _cuda_stream)
+                            DefaultDeviceAllocator allocator,
+                            cudaStream_t cuda_stream)
 {
     // sum the number of chains across all overlaps
-    device_buffer<char> d_temp_buf(_allocator, _cuda_stream);
+    device_buffer<char> d_temp_buf(allocator, cuda_stream);
     void* d_temp_storage           = nullptr;
     std::size_t temp_storage_bytes = 0;
     OverlapToNumResiduesOp overlap_residue_count_op;
     cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
 
-    device_buffer<int32_t> d_num_total_anchors(1, _allocator, _cuda_stream);
+    device_buffer<int32_t> d_num_total_anchors(1, allocator, cuda_stream);
 
     cub::DeviceReduce::Sum(d_temp_storage,
                            temp_storage_bytes,
                            d_residue_counts,
                            d_num_total_anchors.data(),
                            num_overlaps,
-                           _cuda_stream);
+                           cuda_stream);
 
     d_temp_buf.clear_and_resize(temp_storage_bytes);
     d_temp_storage = d_temp_buf.data();
@@ -97,12 +97,12 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                            d_residue_counts,
                            d_num_total_anchors.data(),
                            num_overlaps,
-                           _cuda_stream);
+                           cuda_stream);
 
     d_temp_storage     = nullptr;
     temp_storage_bytes = 0;
 
-    num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), _cuda_stream);
+    num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream);
 
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
     anchor_chain_starts.clear_and_resize(num_overlaps);
@@ -112,7 +112,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                                   d_residue_counts,
                                   anchor_chain_starts.data(),
                                   num_overlaps,
-                                  _cuda_stream);
+                                  cuda_stream);
 
     d_temp_buf.clear_and_resize(temp_storage_bytes);
     d_temp_storage = d_temp_buf.data();
@@ -122,7 +122,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                                   d_residue_counts,
                                   anchor_chain_starts.data(),
                                   num_overlaps,
-                                  _cuda_stream);
+                                  cuda_stream);
 }
 
 __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 12f75b6de..c9c331c2b 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -40,9 +40,10 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start,
                                                   const Anchor& end,
                                                   const int32_t num_anchors);
 
-/// \brief Perform a backtrace on the predecessors array
-/// to generate overlaps from anchors. Anchors must have been chained
-/// by a chaining function that filles the predecessors and scores array.
+/// \brief Produce an array of overlaps by iterating
+/// through the predecessors of each anchor within a chain,
+/// until an anchor with no predecessor is reached. Anchors must have been chained
+/// by a chaining function that fills the predecessors and scores array.
 ///
 /// \param anchors An array of anchors.
 /// \param overlaps An array of overlaps to be filled.
@@ -65,19 +66,19 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
 /// of the anchors in the chain.
 ///
 /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
-/// \param unrolled_anchor_chains  An array of int32_t. Will be resided on return.
+/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return.
 /// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap.
 /// \param num_overlaps The number of overlaps in the overlaps array.
 /// \param num_total_anchors The number of anchors in the anchors array.
-/// \param _allocator The DefaultDeviceAllocator for this overlapper.
-/// \param _cuda_stream The cudastream to allocate memory within.
+/// \param allocator The DefaultDeviceAllocator for this overlapper.
+/// \param cuda_stream The cudastream to allocate memory within.
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
                             const int32_t num_overlaps,
                             int32_t& num_total_anchors,
-                            DefaultDeviceAllocator& _allocator,
-                            cudaStream_t& _cuda_stream);
+                            DefaultDeviceAllocator allocator,
+                            cudaStream_t cuda_stream);
 
 /// \brief Calculate the anchors chains used to produce each overlap in the
 /// overlap array for anchors chained by backtrace_anchors_to_overlaps.

From 78da54195e1d117af4d7399b7fd222304a6b9316 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Tue, 27 Oct 2020 09:48:37 -0400
Subject: [PATCH 13/26] [chainer_utils] Address review comments.

---
 cudamapper/src/chainer_utils.cu               | 123 +++++++++---------
 cudamapper/src/chainer_utils.cuh              |   1 -
 .../tests/Test_CudamapperChainerUtils.cu      |   1 -
 3 files changed, 61 insertions(+), 64 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 208538f0f..b3e39af13 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -65,10 +65,51 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc
     return overlap;
 }
 
+__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
+                                              Overlap* overlaps,
+                                              double* const scores,
+                                              bool* const max_select_mask,
+                                              int32_t* const predecessors,
+                                              const int32_t n_anchors,
+                                              const int32_t min_score)
+{
+    const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (d_tid < n_anchors)
+    {
+
+        int32_t global_overlap_index = d_tid;
+        if (scores[d_tid] >= min_score)
+        {
+
+            int32_t index                = global_overlap_index;
+            int32_t first_index          = index;
+            int32_t num_anchors_in_chain = 0;
+            Anchor final_anchor          = anchors[global_overlap_index];
+
+            while (index != -1)
+            {
+                first_index  = index;
+                int32_t pred = predecessors[index];
+                if (pred != -1)
+                {
+                    max_select_mask[pred] = false;
+                }
+                num_anchors_in_chain++;
+                index = predecessors[index];
+            }
+            Anchor first_anchor            = anchors[first_index];
+            overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+        }
+        else
+        {
+            max_select_mask[global_overlap_index] = false;
+        }
+    }
+}
+
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream)
@@ -86,7 +127,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                            temp_storage_bytes,
                            d_residue_counts,
                            d_num_total_anchors.data(),
-                           num_overlaps,
+                           overlaps.size(),
                            cuda_stream);
 
     d_temp_buf.clear_and_resize(temp_storage_bytes);
@@ -96,7 +137,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                            temp_storage_bytes,
                            d_residue_counts,
                            d_num_total_anchors.data(),
-                           num_overlaps,
+                           overlaps.size(),
                            cuda_stream);
 
     d_temp_storage     = nullptr;
@@ -105,13 +146,13 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
     num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream);
 
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
-    anchor_chain_starts.clear_and_resize(num_overlaps);
+    anchor_chain_starts.clear_and_resize(overlaps.size());
 
     cub::DeviceScan::ExclusiveSum(d_temp_storage,
                                   temp_storage_bytes,
                                   d_residue_counts,
                                   anchor_chain_starts.data(),
-                                  num_overlaps,
+                                  overlaps.size(),
                                   cuda_stream);
 
     d_temp_buf.clear_and_resize(temp_storage_bytes);
@@ -121,31 +162,10 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                                   temp_storage_bytes,
                                   d_residue_counts,
                                   anchor_chain_starts.data(),
-                                  num_overlaps,
+                                  overlaps.size(),
                                   cuda_stream);
 }
 
-__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
-                                             const Anchor* const anchors,
-                                             const int32_t* const chain_starts,
-                                             const int32_t* const chain_lengths,
-                                             int32_t* const anchor_chains,
-                                             int32_t* const anchor_chain_starts,
-                                             const int32_t num_overlaps)
-{
-    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride      = blockDim.x * gridDim.x;
-    for (int i = d_thread_id; i < num_overlaps; i += stride)
-    {
-        int32_t chain_start  = chain_starts[i];
-        int32_t chain_length = chain_lengths[i];
-        for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i)
-        {
-            anchor_chains[ind] = ind;
-        }
-    }
-}
-
 __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
                                                    const Anchor* const anchors,
                                                    const bool* const select_mask,
@@ -181,44 +201,23 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
     }
 }
 
-__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
-                                              Overlap* overlaps,
-                                              double* const scores,
-                                              bool* const max_select_mask,
-                                              int32_t* const predecessors,
-                                              const int32_t n_anchors,
-                                              const int32_t min_score)
+__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
+                                             const Anchor* const anchors,
+                                             const int32_t* const chain_starts,
+                                             const int32_t* const chain_lengths,
+                                             int32_t* const anchor_chains,
+                                             int32_t* const anchor_chain_starts,
+                                             const int32_t num_overlaps)
 {
-    const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (d_tid < n_anchors)
+    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride      = blockDim.x * gridDim.x;
+    for (int i = d_thread_id; i < num_overlaps; i += stride)
     {
-
-        int32_t global_overlap_index = d_tid;
-        if (scores[d_tid] >= min_score)
-        {
-
-            int32_t index                = global_overlap_index;
-            int32_t first_index          = index;
-            int32_t num_anchors_in_chain = 0;
-            Anchor final_anchor          = anchors[global_overlap_index];
-
-            while (index != -1)
-            {
-                first_index  = index;
-                int32_t pred = predecessors[index];
-                if (pred != -1)
-                {
-                    max_select_mask[pred] = false;
-                }
-                num_anchors_in_chain++;
-                index = predecessors[index];
-            }
-            Anchor first_anchor            = anchors[first_index];
-            overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
-        }
-        else
+        int32_t chain_start  = chain_starts[i];
+        int32_t chain_length = chain_lengths[i];
+        for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i)
         {
-            max_select_mask[global_overlap_index] = false;
+            anchor_chains[ind] = ind;
         }
     }
 }
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index c9c331c2b..8735a7bbe 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -75,7 +75,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
 void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            const int32_t num_overlaps,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream);
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 666f3f459..9f1da5914 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -118,7 +118,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     chainerutils::allocate_anchor_chains(d_overlaps,
                                          unrolled_anchor_chains,
                                          chain_starts,
-                                         overlaps.size(),
                                          num_total_anchors,
                                          allocator,
                                          cu_ptr);

From 44f2f4dbecbbaeffe145c0c455fcc0ac2dcb4fad Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Tue, 27 Oct 2020 09:58:28 -0400
Subject: [PATCH 14/26] [chainer_utils] Refactor backtrace_anchors_to_overlaps
 to be a grid-stride function.

---
 cudamapper/src/chainer_utils.cu  | 28 +++++++++++++---------------
 cudamapper/src/chainer_utils.cuh |  6 +++---
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index b3e39af13..c2d23d58d 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -73,18 +73,16 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               const int32_t n_anchors,
                                               const int32_t min_score)
 {
-    const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x;
-    if (d_tid < n_anchors)
+    const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride  = blockDim.x * gridDim.x;
+    for (int i = tid; i < n_anchors; i += stride)
     {
-
-        int32_t global_overlap_index = d_tid;
-        if (scores[d_tid] >= min_score)
+        if (scores[i] >= min_score)
         {
-
-            int32_t index                = global_overlap_index;
+            int32_t index                = i;
             int32_t first_index          = index;
             int32_t num_anchors_in_chain = 0;
-            Anchor final_anchor          = anchors[global_overlap_index];
+            Anchor final_anchor          = anchors[i];
 
             while (index != -1)
             {
@@ -97,17 +95,17 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                 num_anchors_in_chain++;
                 index = predecessors[index];
             }
-            Anchor first_anchor            = anchors[first_index];
-            overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+            Anchor first_anchor = anchors[first_index];
+            overlaps[i]         = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
         }
         else
         {
-            max_select_mask[global_overlap_index] = false;
+            max_select_mask[i] = false;
         }
     }
 }
 
-void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
+void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
                             int32_t& num_total_anchors,
@@ -119,7 +117,7 @@ void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
     void* d_temp_storage           = nullptr;
     std::size_t temp_storage_bytes = 0;
     OverlapToNumResiduesOp overlap_residue_count_op;
-    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
+    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
 
     device_buffer<int32_t> d_num_total_anchors(1, allocator, cuda_stream);
 
@@ -207,11 +205,11 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
                                              const int32_t* const chain_lengths,
                                              int32_t* const anchor_chains,
                                              int32_t* const anchor_chain_starts,
-                                             const int32_t num_overlaps)
+                                             const uint32_t num_overlaps)
 {
     const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     const int32_t stride      = blockDim.x * gridDim.x;
-    for (int i = d_thread_id; i < num_overlaps; i += stride)
+    for (uint32_t i = d_thread_id; i < num_overlaps; i += stride)
     {
         int32_t chain_start  = chain_starts[i];
         int32_t chain_length = chain_lengths[i];
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 8735a7bbe..12ad64fe2 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -72,12 +72,12 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
 /// \param num_total_anchors The number of anchors in the anchors array.
 /// \param allocator The DefaultDeviceAllocator for this overlapper.
 /// \param cuda_stream The cudastream to allocate memory within.
-void allocate_anchor_chains(device_buffer<Overlap>& overlaps,
+void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
                             int32_t& num_total_anchors,
                             DefaultDeviceAllocator allocator,
-                            cudaStream_t cuda_stream);
+                            cudaStream_t cuda_stream = 0);
 
 /// \brief Calculate the anchors chains used to produce each overlap in the
 /// overlap array for anchors chained by backtrace_anchors_to_overlaps.
@@ -115,7 +115,7 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
                                              const int32_t* const chain_lengths,
                                              int32_t* const anchor_chains,
                                              int32_t* const anchor_chain_starts,
-                                             const int32_t num_overlaps);
+                                             const uint32_t num_overlaps);
 
 } // namespace chainerutils
 } // namespace cudamapper

From be3b0a4483bc7857ca5ffccbc19a3bc5737f1d2f Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Tue, 27 Oct 2020 23:34:26 -0400
Subject: [PATCH 15/26] [chainer_utils] Fix bugs in tests caused by not
 specifying cudastream when instantiating device arrays.

---
 cudamapper/tests/Test_CudamapperChainerUtils.cu | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 9f1da5914..560e94432 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -69,8 +69,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests)
 
 TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
 {
-
-    DefaultDeviceAllocator allocator = create_default_device_allocator(2048);
+    DefaultDeviceAllocator allocator = create_default_device_allocator();
 
     CudaStream cuda_stream = make_cuda_stream();
     auto cu_ptr            = cuda_stream.get();
@@ -106,15 +105,15 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     overlaps.push_back(p_ab);
     overlaps.push_back(p_bc);
 
-    device_buffer<Anchor> d_anchors(anchors.size(), allocator, cuda_stream.get());
-    device_buffer<Overlap> d_overlaps(overlaps.size(), allocator, cuda_stream.get());
-    cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data());
-    cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data());
-
-    device_buffer<int32_t> unrolled_anchor_chains;
-    device_buffer<int32_t> chain_starts;
+    device_buffer<Anchor> d_anchors(anchors.size(), allocator, cu_ptr);
+    device_buffer<Overlap> d_overlaps(overlaps.size(), allocator, cu_ptr);
+    cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data(), cu_ptr);
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data(), cu_ptr);
 
+    device_buffer<int32_t> unrolled_anchor_chains(0, allocator, cu_ptr);
+    device_buffer<int32_t> chain_starts(0, allocator, cu_ptr);
     int32_t num_total_anchors;
+
     chainerutils::allocate_anchor_chains(d_overlaps,
                                          unrolled_anchor_chains,
                                          chain_starts,

From 9f75411a491f79a6a5dda56eb4cf68a95910042b Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Wed, 28 Oct 2020 10:26:12 -0400
Subject: [PATCH 16/26] [chainer_utils] Refactor device memory from d_*
 convention to *_d.

---
 cudamapper/src/chainer_utils.cu               | 52 +++++++++----------
 .../tests/Test_CudamapperChainerUtils.cu      | 10 ++--
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index c2d23d58d..fa956a861 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -113,52 +113,52 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             cudaStream_t cuda_stream)
 {
     // sum the number of chains across all overlaps
-    device_buffer<char> d_temp_buf(allocator, cuda_stream);
-    void* d_temp_storage           = nullptr;
+    device_buffer<char> temp_buf_d(allocator, cuda_stream);
+    void* temp_storage_d           = nullptr;
     std::size_t temp_storage_bytes = 0;
     OverlapToNumResiduesOp overlap_residue_count_op;
-    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> d_residue_counts(overlaps.data(), overlap_residue_count_op);
+    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> residue_counts_d(overlaps.data(), overlap_residue_count_op);
 
-    device_buffer<int32_t> d_num_total_anchors(1, allocator, cuda_stream);
+    device_buffer<int32_t> num_total_anchors_d(1, allocator, cuda_stream);
 
-    cub::DeviceReduce::Sum(d_temp_storage,
+    cub::DeviceReduce::Sum(temp_storage_d,
                            temp_storage_bytes,
-                           d_residue_counts,
-                           d_num_total_anchors.data(),
+                           residue_counts_d,
+                           num_total_anchors_d.data(),
                            overlaps.size(),
                            cuda_stream);
 
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
+    temp_buf_d.clear_and_resize(temp_storage_bytes);
+    temp_storage_d = temp_buf_d.data();
 
-    cub::DeviceReduce::Sum(d_temp_storage,
+    cub::DeviceReduce::Sum(temp_storage_d,
                            temp_storage_bytes,
-                           d_residue_counts,
-                           d_num_total_anchors.data(),
+                           residue_counts_d,
+                           num_total_anchors_d.data(),
                            overlaps.size(),
                            cuda_stream);
 
-    d_temp_storage     = nullptr;
+    temp_storage_d     = nullptr;
     temp_storage_bytes = 0;
 
-    num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream);
+    num_total_anchors = cudautils::get_value_from_device(num_total_anchors_d.data(), cuda_stream);
 
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
     anchor_chain_starts.clear_and_resize(overlaps.size());
 
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+    cub::DeviceScan::ExclusiveSum(temp_storage_d,
                                   temp_storage_bytes,
-                                  d_residue_counts,
+                                  residue_counts_d,
                                   anchor_chain_starts.data(),
                                   overlaps.size(),
                                   cuda_stream);
 
-    d_temp_buf.clear_and_resize(temp_storage_bytes);
-    d_temp_storage = d_temp_buf.data();
+    temp_buf_d.clear_and_resize(temp_storage_bytes);
+    temp_storage_d = temp_buf_d.data();
 
-    cub::DeviceScan::ExclusiveSum(d_temp_storage,
+    cub::DeviceScan::ExclusiveSum(temp_storage_d,
                                   temp_storage_bytes,
-                                  d_residue_counts,
+                                  residue_counts_d,
                                   anchor_chain_starts.data(),
                                   overlaps.size(),
                                   cuda_stream);
@@ -173,12 +173,12 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
                                                    const int32_t num_overlaps,
                                                    const bool check_mask)
 {
-    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride      = blockDim.x * gridDim.x;
+    const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
 
     // Processes one overlap per iteration,
     // "i" corresponds to an overlap
-    for (int i = d_thread_id; i < num_overlaps; i += stride)
+    for (int i = thread_id; i < num_overlaps; i += stride)
     {
         // index within this chain of anchors (i.e., the anchors within a single overlap)
 
@@ -207,9 +207,9 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
                                              int32_t* const anchor_chain_starts,
                                              const uint32_t num_overlaps)
 {
-    const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride      = blockDim.x * gridDim.x;
-    for (uint32_t i = d_thread_id; i < num_overlaps; i += stride)
+    const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
+    for (uint32_t i = thread_id; i < num_overlaps; i += stride)
     {
         int32_t chain_start  = chain_starts[i];
         int32_t chain_length = chain_lengths[i];
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 560e94432..609bdd79a 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -105,16 +105,16 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     overlaps.push_back(p_ab);
     overlaps.push_back(p_bc);
 
-    device_buffer<Anchor> d_anchors(anchors.size(), allocator, cu_ptr);
-    device_buffer<Overlap> d_overlaps(overlaps.size(), allocator, cu_ptr);
-    cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data(), cu_ptr);
-    cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data(), cu_ptr);
+    device_buffer<Anchor> anchors_d(anchors.size(), allocator, cu_ptr);
+    device_buffer<Overlap> overlaps_d(overlaps.size(), allocator, cu_ptr);
+    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), cu_ptr);
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), cu_ptr);
 
     device_buffer<int32_t> unrolled_anchor_chains(0, allocator, cu_ptr);
     device_buffer<int32_t> chain_starts(0, allocator, cu_ptr);
     int32_t num_total_anchors;
 
-    chainerutils::allocate_anchor_chains(d_overlaps,
+    chainerutils::allocate_anchor_chains(overlaps_d,
                                          unrolled_anchor_chains,
                                          chain_starts,
                                          num_total_anchors,

From c72e5c966bae243de66f04e1cc4690d41e09963e Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 10:26:57 -0500
Subject: [PATCH 17/26] [chainerutils] Rename create_simple_overlap to
 create_overlap

---
 cudamapper/src/chainer_utils.cu                 |  4 ++--
 cudamapper/src/chainer_utils.cuh                |  2 +-
 cudamapper/tests/Test_CudamapperChainerUtils.cu | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index fa956a861..89875591d 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -38,7 +38,7 @@ struct OverlapToNumResiduesOp
     }
 };
 
-__host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
+__host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
 {
     Overlap overlap;
     overlap.num_residues_ = num_anchors;
@@ -96,7 +96,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                 index = predecessors[index];
             }
             Anchor first_anchor = anchors[first_index];
-            overlaps[i]         = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+            overlaps[i]         = create_overlap(first_anchor, final_anchor, num_anchors_in_chain);
         }
         else
         {
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 12ad64fe2..bf8752da3 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -36,7 +36,7 @@ namespace chainerutils
 /// \param start The first anchor in the chain.
 /// \param end The last anchor in the chain.
 /// \param num_anchors The total number of anchors in the chain.
-__host__ __device__ Overlap create_simple_overlap(const Anchor& start,
+__host__ __device__ Overlap create_overlap(const Anchor& start,
                                                   const Anchor& end,
                                                   const int32_t num_anchors);
 
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 609bdd79a..3638dd92b 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -32,7 +32,7 @@ namespace genomeworks
 namespace cudamapper
 {
 
-TEST(TestChainerUtils, Create_Simple_Overlap_Tests)
+TEST(TestChainerUtils, Create_Overlap_Tests)
 {
     Anchor a;
     a.query_read_id_           = 12;
@@ -46,7 +46,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests)
     b.target_read_id_          = 46;
     b.target_position_in_read_ = 20400;
 
-    Overlap p_ab = chainerutils::create_simple_overlap(a, b, 12);
+    Overlap p_ab = chainerutils::create_overlap(a, b, 12);
     ASSERT_EQ(p_ab.query_read_id_, static_cast<uint32_t>(12));
     ASSERT_EQ(p_ab.target_read_id_, 46);
     ASSERT_EQ(p_ab.query_start_position_in_read_, 130);
@@ -61,7 +61,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests)
     c.target_read_id_          = 46;
     c.target_position_in_read_ = 16000;
 
-    Overlap p_bc = chainerutils::create_simple_overlap(b, c, 22);
+    Overlap p_bc = chainerutils::create_overlap(b, c, 22);
     ASSERT_EQ(p_bc.target_start_position_in_read_, 16000);
     ASSERT_EQ(p_bc.target_end_position_in_read_, 20400);
     ASSERT_EQ(static_cast<char>(p_bc.relative_strand), static_cast<char>(RelativeStrand::Reverse));
@@ -86,7 +86,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     b.target_read_id_          = 46;
     b.target_position_in_read_ = 20400;
 
-    Overlap p_ab = chainerutils::create_simple_overlap(a, b, 2);
+    Overlap p_ab = chainerutils::create_overlap(a, b, 2);
 
     Anchor c;
     c.query_read_id_           = 12;
@@ -94,7 +94,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
     c.target_read_id_          = 46;
     c.target_position_in_read_ = 16000;
 
-    Overlap p_bc = chainerutils::create_simple_overlap(b, c, 2);
+    Overlap p_bc = chainerutils::create_overlap(b, c, 2);
 
     std::vector<Anchor> anchors;
     anchors.push_back(a);

From 6df3b764931864fc0a979202137471118f26d5b6 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 10:32:00 -0500
Subject: [PATCH 18/26] [chainerutils] Move declaration of temporary memory
 used by CUB.

---
 cudamapper/src/chainer_utils.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 89875591d..0cdb13ad7 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -113,11 +113,12 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             cudaStream_t cuda_stream)
 {
     // sum the number of chains across all overlaps
-    device_buffer<char> temp_buf_d(allocator, cuda_stream);
+
     void* temp_storage_d           = nullptr;
     std::size_t temp_storage_bytes = 0;
     OverlapToNumResiduesOp overlap_residue_count_op;
-    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> residue_counts_d(overlaps.data(), overlap_residue_count_op);
+    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> residue_counts_d(overlaps.data(),
+                                                                                                  overlap_residue_count_op);
 
     device_buffer<int32_t> num_total_anchors_d(1, allocator, cuda_stream);
 
@@ -128,7 +129,7 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                            overlaps.size(),
                            cuda_stream);
 
-    temp_buf_d.clear_and_resize(temp_storage_bytes);
+    device_buffer<char> temp_buf_d(temp_storage_bytes, allocator, cuda_stream);
     temp_storage_d = temp_buf_d.data();
 
     cub::DeviceReduce::Sum(temp_storage_d,

From 781c2fd201ed96a4027a929fb6ed3b73d95cb3ce Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 10:52:44 -0500
Subject: [PATCH 19/26] [chainerutils] Use 64bit ints for the number of
 anchors.

---
 cudamapper/src/chainer_utils.cu                 | 6 +++---
 cudamapper/src/chainer_utils.cuh                | 8 ++++----
 cudamapper/tests/Test_CudamapperChainerUtils.cu | 2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 0cdb13ad7..bb7231877 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -70,7 +70,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               double* const scores,
                                               bool* const max_select_mask,
                                               int32_t* const predecessors,
-                                              const int32_t n_anchors,
+                                              const int64_t n_anchors,
                                               const int32_t min_score)
 {
     const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
@@ -108,7 +108,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
 void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            int32_t& num_total_anchors,
+                            int64_t& num_total_anchors,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream)
 {
@@ -120,7 +120,7 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
     cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> residue_counts_d(overlaps.data(),
                                                                                                   overlap_residue_count_op);
 
-    device_buffer<int32_t> num_total_anchors_d(1, allocator, cuda_stream);
+    device_buffer<int64_t> num_total_anchors_d(1, allocator, cuda_stream);
 
     cub::DeviceReduce::Sum(temp_storage_d,
                            temp_storage_bytes,
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index bf8752da3..14af4b21a 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -37,8 +37,8 @@ namespace chainerutils
 /// \param end The last anchor in the chain.
 /// \param num_anchors The total number of anchors in the chain.
 __host__ __device__ Overlap create_overlap(const Anchor& start,
-                                                  const Anchor& end,
-                                                  const int32_t num_anchors);
+                                           const Anchor& end,
+                                           const int32_t num_anchors);
 
 /// \brief Produce an array of overlaps by iterating
 /// through the predecessors of each anchor within a chain,
@@ -57,7 +57,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               double* const scores,
                                               bool* const max_select_mask,
                                               int32_t* const predecessors,
-                                              const int32_t n_anchors,
+                                              const int64_t n_anchors,
                                               const int32_t min_score);
 
 /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
@@ -75,7 +75,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
 void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             device_buffer<int32_t>& unrolled_anchor_chains,
                             device_buffer<int32_t>& anchor_chain_starts,
-                            int32_t& num_total_anchors,
+                            int64_t& num_total_anchors,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream = 0);
 
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 3638dd92b..af95802dc 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -112,7 +112,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
 
     device_buffer<int32_t> unrolled_anchor_chains(0, allocator, cu_ptr);
     device_buffer<int32_t> chain_starts(0, allocator, cu_ptr);
-    int32_t num_total_anchors;
+    int64_t num_total_anchors;
 
     chainerutils::allocate_anchor_chains(overlaps_d,
                                          unrolled_anchor_chains,

From bd07138da853b3aa9b5f84c94093f4401e38b38a Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 12:07:02 -0500
Subject: [PATCH 20/26] [chainerutils] Attempt to clean up documentation.

---
 cudamapper/src/chainer_utils.cu  |  5 +++--
 cudamapper/src/chainer_utils.cuh | 22 ++++++++++------------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index bb7231877..d6bd962c9 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -73,8 +73,9 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               const int64_t n_anchors,
                                               const int32_t min_score)
 {
-    const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride  = blockDim.x * gridDim.x;
+    const int64_t tid    = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride = blockDim.x * gridDim.x;
+
     for (int i = tid; i < n_anchors; i += stride)
     {
         if (scores[i] >= min_score)
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 14af4b21a..f7488eb78 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -40,10 +40,8 @@ __host__ __device__ Overlap create_overlap(const Anchor& start,
                                            const Anchor& end,
                                            const int32_t num_anchors);
 
-/// \brief Produce an array of overlaps by iterating
-/// through the predecessors of each anchor within a chain,
-/// until an anchor with no predecessor is reached. Anchors must have been chained
-/// by a chaining function that fills the predecessors and scores array.
+/// \brief Given an array of anchors and an array denoting the immediate
+/// predecessor of each anchor, transform chains of anchors into overlaps.
 ///
 /// \param anchors An array of anchors.
 /// \param overlaps An array of overlaps to be filled.
@@ -60,13 +58,12 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               const int64_t n_anchors,
                                               const int32_t min_score);
 
-/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array
-/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than
-/// copy the anchors, the final array holds the indices within the anchors array
+/// \brief Allocate a 1-dimensional array representing an unrolled ragged array
+/// of anchors within each overlap. The final array holds the indices within the anchors array
 /// of the anchors in the chain.
 ///
 /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
-/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return.
+/// \param unrolled_anchor_chains An array of int32_t. Will be resized on return.
 /// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap.
 /// \param num_overlaps The number of overlaps in the overlaps array.
 /// \param num_total_anchors The number of anchors in the anchors array.
@@ -79,8 +76,10 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream = 0);
 
-/// \brief Calculate the anchors chains used to produce each overlap in the
-/// overlap array for anchors chained by backtrace_anchors_to_overlaps.
+/// \brief Given an array of overlaps, fill a 1D unrolled ragged array
+/// containing the anchors used to generate each overlap. Anchors must have
+/// been chained with a chaining function that fills the predecessors array
+/// with the immediate predecessor of each anchor.
 ///
 /// \param overlaps An array of overlaps.
 /// \param anchors The array of anchors used to generate overlaps.
@@ -99,8 +98,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
                                                    const int32_t num_overlaps,
                                                    const bool check_mask);
 
-/// \brief Calculate the anchors chains used to produce each overlap in the
-/// overlap array for anchors chained by RLE.
+/// \brief Fill a 1D unrolled ragged array with the anchors used to produce each overlap.
 ///
 /// \param overlaps An array of overlaps.
 /// \param anchors The array of anchors used to generate overlaps.

From 178b075fb37f65df4ad67877c835503d6efcee03 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 19:16:23 -0500
Subject: [PATCH 21/26] Remove unnecessary assignment in chainerutils.

---
 cudamapper/src/chainer_utils.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index d6bd962c9..1f68d325d 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -193,8 +193,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
             while (index != -1)
             {
                 anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index;
-                int32_t pred                                                                             = predecessors[index];
-                index                                                                                    = pred;
+                index                                                                                    = predecessors[index];
                 ++anchor_chain_index;
             }
         }

From 6244636dab64dc9dc845b8c743642cc9ab9f6cc1 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 20:39:59 -0500
Subject: [PATCH 22/26] [chainerutils] Refactor allocate_anchor_chains to use
 thrust, rather than CUB.

---
 cudamapper/src/chainer_utils.cu               | 67 ++++++-------------
 cudamapper/src/overlapper_triggered.cu        |  5 --
 .../tests/Test_CudamapperChainerUtils.cu      |  9 ++-
 3 files changed, 28 insertions(+), 53 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 1f68d325d..5d6c135bf 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -17,6 +17,9 @@
 #include "chainer_utils.cuh"
 
 #include <cub/cub.cuh>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/reduce.h>
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
 namespace claraparabricks
@@ -38,6 +41,14 @@ struct OverlapToNumResiduesOp
     }
 };
 
+struct ConvertOverlapToNumResidues : public thrust::unary_function<Overlap, int32_t>
+{
+    __host__ __device__ int32_t operator()(const Overlap& o) const
+    {
+        return o.num_residues_;
+    }
+};
+
 __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors)
 {
     Overlap overlap;
@@ -113,57 +124,19 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             DefaultDeviceAllocator allocator,
                             cudaStream_t cuda_stream)
 {
-    // sum the number of chains across all overlaps
-
-    void* temp_storage_d           = nullptr;
-    std::size_t temp_storage_bytes = 0;
-    OverlapToNumResiduesOp overlap_residue_count_op;
-    cub::TransformInputIterator<int32_t, OverlapToNumResiduesOp, const Overlap*> residue_counts_d(overlaps.data(),
-                                                                                                  overlap_residue_count_op);
-
-    device_buffer<int64_t> num_total_anchors_d(1, allocator, cuda_stream);
-
-    cub::DeviceReduce::Sum(temp_storage_d,
-                           temp_storage_bytes,
-                           residue_counts_d,
-                           num_total_anchors_d.data(),
-                           overlaps.size(),
-                           cuda_stream);
-
-    device_buffer<char> temp_buf_d(temp_storage_bytes, allocator, cuda_stream);
-    temp_storage_d = temp_buf_d.data();
-
-    cub::DeviceReduce::Sum(temp_storage_d,
-                           temp_storage_bytes,
-                           residue_counts_d,
-                           num_total_anchors_d.data(),
-                           overlaps.size(),
-                           cuda_stream);
-
-    temp_storage_d     = nullptr;
-    temp_storage_bytes = 0;
-
-    num_total_anchors = cudautils::get_value_from_device(num_total_anchors_d.data(), cuda_stream);
+    auto thrust_exec_policy = thrust::cuda::par(allocator).on(cuda_stream);
+    num_total_anchors       = thrust::reduce(thrust_exec_policy,
+                                       thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
+                                       thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
+                                       0);
 
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
     anchor_chain_starts.clear_and_resize(overlaps.size());
 
-    cub::DeviceScan::ExclusiveSum(temp_storage_d,
-                                  temp_storage_bytes,
-                                  residue_counts_d,
-                                  anchor_chain_starts.data(),
-                                  overlaps.size(),
-                                  cuda_stream);
-
-    temp_buf_d.clear_and_resize(temp_storage_bytes);
-    temp_storage_d = temp_buf_d.data();
-
-    cub::DeviceScan::ExclusiveSum(temp_storage_d,
-                                  temp_storage_bytes,
-                                  residue_counts_d,
-                                  anchor_chain_starts.data(),
-                                  overlaps.size(),
-                                  cuda_stream);
+    thrust::exclusive_scan(thrust_exec_policy,
+                           thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
+                           thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
+                           anchor_chain_starts.data());
 }
 
 __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
diff --git a/cudamapper/src/overlapper_triggered.cu b/cudamapper/src/overlapper_triggered.cu
index 331ebb936..68ed98160 100644
--- a/cudamapper/src/overlapper_triggered.cu
+++ b/cudamapper/src/overlapper_triggered.cu
@@ -24,11 +24,6 @@
 
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
-#ifndef NDEBUG // only needed to check if input is sorted in assert
-#include <algorithm>
-#include <thrust/host_vector.h>
-#endif
-
 namespace claraparabricks
 {
 
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index af95802dc..7ae64c78b 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -67,7 +67,7 @@ TEST(TestChainerUtils, Create_Overlap_Tests)
     ASSERT_EQ(static_cast<char>(p_bc.relative_strand), static_cast<char>(RelativeStrand::Reverse));
 }
 
-TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
+TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests)
 {
     DefaultDeviceAllocator allocator = create_default_device_allocator();
 
@@ -120,6 +120,13 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests)
                                          num_total_anchors,
                                          allocator,
                                          cu_ptr);
+
+    ASSERT_EQ(num_total_anchors, 4);
+
+    std::vector<int32_t> anchors_chain_starts_h(overlaps.size());
+    cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), cu_ptr);
+    ASSERT_EQ(anchors_chain_starts_h[0], 0);
+    ASSERT_EQ(anchors_chain_starts_h[1], 2);
 }
 
 } // namespace cudamapper

From d933b4c2707400ef04ca8eadf66162a147b56123 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 2 Nov 2020 21:14:21 -0500
Subject: [PATCH 23/26] [chainerutils] Use a gride-stride function with the
 block as the unit to process backtraced overlaps.

---
 cudamapper/src/chainer_utils.cu | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 5d6c135bf..342325831 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -84,11 +84,16 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               const int64_t n_anchors,
                                               const int32_t min_score)
 {
-    const int64_t tid    = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride = blockDim.x * gridDim.x;
+    const int64_t block_id = blockIdx.x;
+    const int32_t stride   = gridDim.x;
 
-    for (int i = tid; i < n_anchors; i += stride)
+    for (int i = block_id; i < n_anchors; i += stride)
     {
+
+        if (threadIdx.x != 0)
+        {
+            return;
+        }
         if (scores[i] >= min_score)
         {
             int32_t index                = i;
@@ -148,14 +153,17 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
                                                    const int32_t num_overlaps,
                                                    const bool check_mask)
 {
-    const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride    = blockDim.x * gridDim.x;
+    const int32_t block_id = blockIdx.x;
+    const int32_t stride   = gridDim.x;
 
     // Processes one overlap per iteration,
     // "i" corresponds to an overlap
-    for (int i = thread_id; i < num_overlaps; i += stride)
+    for (int i = block_id; i < num_overlaps; i += stride)
     {
-        // index within this chain of anchors (i.e., the anchors within a single overlap)
+        if (threadIdx.x != 0)
+        {
+            return;
+        }
 
         if (!check_mask || (check_mask & select_mask[i]))
         {

From b3102926c6559ca5dc85dfdd5c8d0aa68ace7339 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Tue, 3 Nov 2020 08:33:40 -0500
Subject: [PATCH 24/26] [chainerutils] Convert single-block functions back to
 single-thread.

---
 cudamapper/src/chainer_utils.cu | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index 342325831..d4c714e45 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -84,16 +84,11 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               const int64_t n_anchors,
                                               const int32_t min_score)
 {
-    const int64_t block_id = blockIdx.x;
-    const int32_t stride   = gridDim.x;
+    const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
 
-    for (int i = block_id; i < n_anchors; i += stride)
+    for (int i = thread_id; i < n_anchors; i += stride)
     {
-
-        if (threadIdx.x != 0)
-        {
-            return;
-        }
         if (scores[i] >= min_score)
         {
             int32_t index                = i;
@@ -153,17 +148,13 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
                                                    const int32_t num_overlaps,
                                                    const bool check_mask)
 {
-    const int32_t block_id = blockIdx.x;
-    const int32_t stride   = gridDim.x;
+    const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const int32_t stride    = blockDim.x * gridDim.x;
 
     // Processes one overlap per iteration,
     // "i" corresponds to an overlap
-    for (int i = block_id; i < num_overlaps; i += stride)
+    for (int i = thread_id; i < num_overlaps; i += stride)
     {
-        if (threadIdx.x != 0)
-        {
-            return;
-        }
 
         if (!check_mask || (check_mask & select_mask[i]))
         {

From 405ec7e4e0b090a3323da5970c038a219078c5bc Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Fri, 6 Nov 2020 10:29:48 -0500
Subject: [PATCH 25/26] [cudamapper] Fix issues from PR review.

- Use thrust transform_reduce and transform_exclusive_scan (rather than make_transform iterator) whe>
- Constify pointers where possible.
- Remove unnecessary includes.
- Tweak docs for backtrace_anchors_to_overlap
---
 cudaaligner/samples/sample_cudaaligner.cpp |  1 -
 cudamapper/samples/sample_cudamapper.cpp   |  1 -
 cudamapper/src/chainer_utils.cu            | 46 +++++++++++-----------
 cudamapper/src/chainer_utils.cuh           |  8 ++--
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/cudaaligner/samples/sample_cudaaligner.cpp b/cudaaligner/samples/sample_cudaaligner.cpp
index 50f62dc6a..f236ae2b3 100644
--- a/cudaaligner/samples/sample_cudaaligner.cpp
+++ b/cudaaligner/samples/sample_cudaaligner.cpp
@@ -27,7 +27,6 @@
 #include <stdexcept>
 #include <iostream>
 #include <random>
-#include <getopt.h>
 
 using namespace claraparabricks::genomeworks;
 using namespace claraparabricks::genomeworks::genomeutils;
diff --git a/cudamapper/samples/sample_cudamapper.cpp b/cudamapper/samples/sample_cudamapper.cpp
index f31a38829..142439d4b 100644
--- a/cudamapper/samples/sample_cudamapper.cpp
+++ b/cudamapper/samples/sample_cudamapper.cpp
@@ -29,7 +29,6 @@
 #include <iostream>
 #include <string>
 #include <mutex>
-#include <unistd.h>
 #include <getopt.h>
 
 // define constants. See cudamapper/src/application_parameters.hpp for more.
diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index d4c714e45..c116667cf 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -16,9 +16,8 @@
 
 #include "chainer_utils.cuh"
 
-#include <cub/cub.cuh>
 #include <thrust/execution_policy.h>
-#include <thrust/iterator/transform_iterator.h>
+#include <thrust/transform_scan.h>
 #include <thrust/reduce.h>
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
@@ -33,14 +32,6 @@ namespace cudamapper
 namespace chainerutils
 {
 
-struct OverlapToNumResiduesOp
-{
-    __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const
-    {
-        return overlap.num_residues_;
-    }
-};
-
 struct ConvertOverlapToNumResidues : public thrust::unary_function<Overlap, int32_t>
 {
     __host__ __device__ int32_t operator()(const Overlap& o) const
@@ -77,12 +68,12 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& en
 }
 
 __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
-                                              Overlap* overlaps,
-                                              double* const scores,
+                                              Overlap* const overlaps,
+                                              const double* const scores,
                                               bool* const max_select_mask,
-                                              int32_t* const predecessors,
+                                              const int32_t* const predecessors,
                                               const int64_t n_anchors,
-                                              const int32_t min_score)
+                                              const double min_score)
 {
     const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     const int32_t stride    = blockDim.x * gridDim.x;
@@ -125,18 +116,29 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
                             cudaStream_t cuda_stream)
 {
     auto thrust_exec_policy = thrust::cuda::par(allocator).on(cuda_stream);
-    num_total_anchors       = thrust::reduce(thrust_exec_policy,
-                                       thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
-                                       thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
-                                       0);
+    thrust::plus<int32_t> sum_op;
+    num_total_anchors = thrust::transform_reduce(thrust_exec_policy,
+                                                 overlaps.begin(),
+                                                 overlaps.end(),
+                                                 ConvertOverlapToNumResidues(),
+                                                 0,
+                                                 sum_op);
 
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
     anchor_chain_starts.clear_and_resize(overlaps.size());
 
-    thrust::exclusive_scan(thrust_exec_policy,
-                           thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
-                           thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
-                           anchor_chain_starts.data());
+    // thrust::exclusive_scan(thrust_exec_policy,
+    //                        thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
+    //                        thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
+    //                        anchor_chain_starts.data());
+
+    thrust::transform_exclusive_scan(thrust_exec_policy,
+                                     overlaps.begin(),
+                                     overlaps.end(),
+                                     anchor_chain_starts.data(),
+                                     ConvertOverlapToNumResidues(),
+                                     0,
+                                     sum_op);
 }
 
 __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index f7488eb78..50e184541 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -46,17 +46,17 @@ __host__ __device__ Overlap create_overlap(const Anchor& start,
 /// \param anchors An array of anchors.
 /// \param overlaps An array of overlaps to be filled.
 /// \param scores An array of scores. Only chains with a score greater than min_score will be backtraced.
-/// \param max_select_mask A boolean mask, used to mask out any subchains during the backtrace.
+/// \param max_select_mask A boolean mask, used to mask out any chains which are completely contained within larger chains during the backtrace.
 /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain.
 /// \param n_anchors The number of anchors.
 /// \param min_score The minimum score of a chain for performing backtracing.
 __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               Overlap* const overlaps,
-                                              double* const scores,
+                                              const double* const scores,
                                               bool* const max_select_mask,
-                                              int32_t* const predecessors,
+                                              const int32_t* const predecessors,
                                               const int64_t n_anchors,
-                                              const int32_t min_score);
+                                              const double min_score);
 
 /// \brief Allocate a 1-dimensional array representing an unrolled ragged array
 /// of anchors within each overlap. The final array holds the indices within the anchors array

From 530adfd65e913d03fe7d6e1709786d91a9b4b608 Mon Sep 17 00:00:00 2001
From: "Eric T. Dawson" <eric.t.dawson@gmail.com>
Date: Mon, 9 Nov 2020 21:57:35 -0500
Subject: [PATCH 26/26] [chainerutils] Address review comments.

Primarily, this adds a number of tests for output_overlap_chains_by_RLE,
output_overlap_chains_by_backtrace, and backtrace_anchors_to_overlaps.
It also fixes a bug where the last anchor position in a chain was not returned
from backtrace_anchors_to_overlaps, causing faulty output when later extracting anchor
chains.
---
 cudamapper/src/chainer_utils.cu               |  39 ++-
 cudamapper/src/chainer_utils.cuh              |  18 +-
 cudamapper/src/overlapper_triggered.cu        |   5 +
 .../tests/Test_CudamapperChainerUtils.cu      | 278 ++++++++++++++++--
 4 files changed, 299 insertions(+), 41 deletions(-)

diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu
index c116667cf..af4490652 100644
--- a/cudamapper/src/chainer_utils.cu
+++ b/cudamapper/src/chainer_utils.cu
@@ -32,6 +32,15 @@ namespace cudamapper
 namespace chainerutils
 {
 
+__device__ __forceinline__ Anchor empty_anchor()
+{
+    Anchor a;
+    a.query_read_id_           = UINT32_MAX;
+    a.query_position_in_read_  = UINT32_MAX;
+    a.target_read_id_          = UINT32_MAX;
+    a.target_position_in_read_ = UINT32_MAX;
+    return a;
+}
 struct ConvertOverlapToNumResidues : public thrust::unary_function<Overlap, int32_t>
 {
     __host__ __device__ int32_t operator()(const Overlap& o) const
@@ -69,11 +78,12 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& en
 
 __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               Overlap* const overlaps,
-                                              const double* const scores,
+                                              int32_t* const overlap_terminal_anchors,
+                                              const float* const scores,
                                               bool* const max_select_mask,
                                               const int32_t* const predecessors,
                                               const int64_t n_anchors,
-                                              const double min_score)
+                                              const float min_score)
 {
     const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     const int32_t stride    = blockDim.x * gridDim.x;
@@ -98,12 +108,15 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                 num_anchors_in_chain++;
                 index = predecessors[index];
             }
-            Anchor first_anchor = anchors[first_index];
-            overlaps[i]         = create_overlap(first_anchor, final_anchor, num_anchors_in_chain);
+            Anchor first_anchor         = anchors[first_index];
+            overlap_terminal_anchors[i] = i;
+            overlaps[i]                 = create_overlap(first_anchor, final_anchor, num_anchors_in_chain);
         }
         else
         {
-            max_select_mask[i] = false;
+            max_select_mask[i]          = false;
+            overlap_terminal_anchors[i] = -1;
+            overlaps[i]                 = create_overlap(empty_anchor(), empty_anchor(), 1);
         }
     }
 }
@@ -127,11 +140,6 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
     unrolled_anchor_chains.clear_and_resize(num_total_anchors);
     anchor_chain_starts.clear_and_resize(overlaps.size());
 
-    // thrust::exclusive_scan(thrust_exec_policy,
-    //                        thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()),
-    //                        thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()),
-    //                        anchor_chain_starts.data());
-
     thrust::transform_exclusive_scan(thrust_exec_policy,
                                      overlaps.begin(),
                                      overlaps.end(),
@@ -145,6 +153,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
                                                    const Anchor* const anchors,
                                                    const bool* const select_mask,
                                                    const int32_t* const predecessors,
+                                                   const int32_t* const chain_terminators,
                                                    int32_t* const anchor_chains,
                                                    int32_t* const anchor_chain_starts,
                                                    const int32_t num_overlaps,
@@ -163,11 +172,11 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
             int32_t anchor_chain_index = 0;
             // As chaining proceeds backwards (i.e., it's a backtrace),
             // we need to fill the new anchor chain array in in reverse order.
-            int32_t index = anchor_chain_starts[i];
+            int32_t index = chain_terminators[i];
             while (index != -1)
             {
-                anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index;
-                index                                                                                    = predecessors[index];
+                anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index - 1)] = index;
+                index                                                                                        = predecessors[index];
                 ++anchor_chain_index;
             }
         }
@@ -188,9 +197,9 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
     {
         int32_t chain_start  = chain_starts[i];
         int32_t chain_length = chain_lengths[i];
-        for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i)
+        for (int32_t index = chain_start; index < chain_start + chain_length; ++index)
         {
-            anchor_chains[ind] = ind;
+            anchor_chains[index] = index;
         }
     }
 }
diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh
index 50e184541..d564315fe 100644
--- a/cudamapper/src/chainer_utils.cuh
+++ b/cudamapper/src/chainer_utils.cuh
@@ -50,23 +50,26 @@ __host__ __device__ Overlap create_overlap(const Anchor& start,
 /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain.
 /// \param n_anchors The number of anchors.
 /// \param min_score The minimum score of a chain for performing backtracing.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
 __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors,
                                               Overlap* const overlaps,
-                                              const double* const scores,
+                                              int32_t* const overlap_terminal_anchors,
+                                              const float* const scores,
                                               bool* const max_select_mask,
                                               const int32_t* const predecessors,
                                               const int64_t n_anchors,
-                                              const double min_score);
+                                              const float min_score);
 
 /// \brief Allocate a 1-dimensional array representing an unrolled ragged array
-/// of anchors within each overlap. The final array holds the indices within the anchors array
+/// of anchors within each overlap. The final array holds the indices within the anchors array used in chaining
 /// of the anchors in the chain.
 ///
 /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field
 /// \param unrolled_anchor_chains An array of int32_t. Will be resized on return.
-/// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap.
+/// \param anchor_chain_starts An array holding the index of the first anchor in an overlap within the anchors array used during chaining.
 /// \param num_overlaps The number of overlaps in the overlaps array.
-/// \param num_total_anchors The number of anchors in the anchors array.
+/// \param num_total_anchors The total number of anchors across all overlaps.
 /// \param allocator The DefaultDeviceAllocator for this overlapper.
 /// \param cuda_stream The cudastream to allocate memory within.
 void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
@@ -89,10 +92,13 @@ void allocate_anchor_chains(const device_buffer<Overlap>& overlaps,
 /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array
 /// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
 __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps,
                                                    const Anchor* const anchors,
                                                    const bool* const select_mask,
                                                    const int32_t* const predecessors,
+                                                   const int32_t* const chain_terminators,
                                                    int32_t* const anchor_chains,
                                                    int32_t* const anchor_chain_starts,
                                                    const int32_t num_overlaps,
@@ -107,6 +113,8 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps
 /// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain.
 /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array.
 /// \param num_overlaps The number of overlaps in the overlaps array.
+/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated.
+/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...)
 __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps,
                                              const Anchor* const anchors,
                                              const int32_t* const chain_starts,
diff --git a/cudamapper/src/overlapper_triggered.cu b/cudamapper/src/overlapper_triggered.cu
index 68ed98160..d55f63fd0 100644
--- a/cudamapper/src/overlapper_triggered.cu
+++ b/cudamapper/src/overlapper_triggered.cu
@@ -24,6 +24,11 @@
 
 #include <claraparabricks/genomeworks/utils/cudautils.hpp>
 
+#include <algorithm>
+
+#ifndef NDEBUG
+#include <thrust/host_vector.h>
+#endif
 namespace claraparabricks
 {
 
diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu
index 7ae64c78b..71447e17f 100644
--- a/cudamapper/tests/Test_CudamapperChainerUtils.cu
+++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu
@@ -16,8 +16,6 @@
 
 #include "gtest/gtest.h"
 
-#include <thrust/device_vector.h>
-#include <thrust/host_vector.h>
 #include <vector>
 
 #include <claraparabricks/genomeworks/cudamapper/types.hpp>
@@ -53,7 +51,9 @@ TEST(TestChainerUtils, Create_Overlap_Tests)
     ASSERT_EQ(p_ab.query_end_position_in_read_, 10000);
     ASSERT_EQ(p_ab.target_start_position_in_read_, 10320);
     ASSERT_EQ(p_ab.num_residues_, 12);
-    ASSERT_EQ(static_cast<char>(p_ab.relative_strand), static_cast<char>(RelativeStrand::Forward));
+    // ASSERT_EQ(static_cast<char>(p_ab.relative_strand), static_cast<char>(RelativeStrand::Forward));
+
+    ASSERT_EQ(p_ab.relative_strand, RelativeStrand::Forward);
 
     Anchor c;
     c.query_read_id_           = 12;
@@ -64,15 +64,92 @@ TEST(TestChainerUtils, Create_Overlap_Tests)
     Overlap p_bc = chainerutils::create_overlap(b, c, 22);
     ASSERT_EQ(p_bc.target_start_position_in_read_, 16000);
     ASSERT_EQ(p_bc.target_end_position_in_read_, 20400);
-    ASSERT_EQ(static_cast<char>(p_bc.relative_strand), static_cast<char>(RelativeStrand::Reverse));
+    ASSERT_EQ(p_bc.relative_strand, RelativeStrand::Reverse);
+}
+
+TEST(TestChainerUtils, Anchor_Backtrace_Tests)
+{
+    Anchor a;
+    a.query_read_id_           = 12;
+    a.query_position_in_read_  = 130;
+    a.target_read_id_          = 46;
+    a.target_position_in_read_ = 10320;
+
+    Anchor b;
+    b.query_read_id_           = 12;
+    b.query_position_in_read_  = 10000;
+    b.target_read_id_          = 46;
+    b.target_position_in_read_ = 20400;
+
+    Anchor c;
+    c.query_read_id_           = 12;
+    c.query_position_in_read_  = 15000;
+    c.target_read_id_          = 46;
+    c.target_position_in_read_ = 1000;
+
+    std::vector<Anchor> anchors       = {a, b, c};
+    std::vector<float> scores         = {10, 20, 25};
+    std::vector<int32_t> predecessors = {-1, 0, 0};
+
+    DefaultDeviceAllocator allocator = create_default_device_allocator();
+    CudaStream backtrace_cuda_stream = make_cuda_stream();
+    auto backtrace_cu_stream_ptr     = backtrace_cuda_stream.get();
+
+    device_buffer<Anchor> anchors_d(anchors.size(), allocator, backtrace_cu_stream_ptr);
+    device_buffer<Overlap> overlaps_d(anchors.size(), allocator, backtrace_cu_stream_ptr);
+    device_buffer<float> scores_d(anchors.size(), allocator, backtrace_cu_stream_ptr);
+    device_buffer<int32_t> predecessors_d(anchors.size(), allocator, backtrace_cu_stream_ptr);
+    device_buffer<int32_t> overlap_terminal_anchors(anchors.size(), allocator, backtrace_cu_stream_ptr);
+    device_buffer<bool> mask_d(anchors.size(), allocator, backtrace_cu_stream_ptr);
+
+    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), backtrace_cu_stream_ptr);
+    cudautils::device_copy_n(scores.data(), scores.size(), scores_d.data(), backtrace_cu_stream_ptr);
+    cudautils::device_copy_n(predecessors.data(), predecessors.size(), predecessors_d.data(), backtrace_cu_stream_ptr);
+
+    chainerutils::backtrace_anchors_to_overlaps<<<64, 32, 0, backtrace_cu_stream_ptr>>>(anchors_d.data(),
+                                                                                        overlaps_d.data(),
+                                                                                        overlap_terminal_anchors.data(),
+                                                                                        scores_d.data(),
+                                                                                        mask_d.data(),
+                                                                                        predecessors_d.data(),
+                                                                                        anchors_d.size(),
+                                                                                        20);
+
+    std::vector<Overlap> overlaps;
+    overlaps.resize(overlaps_d.size());
+    cudautils::device_copy_n(overlaps_d.data(), overlaps_d.size(), overlaps.data(), backtrace_cu_stream_ptr);
+    std::vector<int32_t> terminals_h;
+    terminals_h.resize(overlaps.size());
+    cudautils::device_copy_n(overlap_terminal_anchors.data(), overlap_terminal_anchors.size(), terminals_h.data(), backtrace_cu_stream_ptr);
+    ASSERT_EQ(overlaps_d.size(), 3);
+
+    ASSERT_EQ(overlaps[0].num_residues_, 1);
+    ASSERT_EQ(overlaps[0].target_read_id_, UINT32_MAX);
+
+    ASSERT_EQ(overlaps[1].num_residues_, 2);
+    ASSERT_EQ(overlaps[1].query_read_id_, 12);
+    ASSERT_EQ(overlaps[1].query_start_position_in_read_, 130);
+    ASSERT_EQ(overlaps[1].target_start_position_in_read_, 10320);
+    ASSERT_EQ(overlaps[1].query_end_position_in_read_, 10000);
+    ASSERT_EQ(overlaps[1].target_end_position_in_read_, 20400);
+    ASSERT_EQ(overlaps[1].relative_strand, RelativeStrand::Forward);
+
+    ASSERT_EQ(overlaps[2].num_residues_, 2);
+    ASSERT_EQ(overlaps[2].query_read_id_, 12);
+    ASSERT_EQ(overlaps[2].query_end_position_in_read_, 15000);
+    ASSERT_EQ(overlaps[2].relative_strand, RelativeStrand::Reverse);
+
+    ASSERT_EQ(terminals_h[0], -1);
+    ASSERT_EQ(terminals_h[1], 1);
+    ASSERT_EQ(terminals_h[2], 2);
 }
 
 TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests)
 {
     DefaultDeviceAllocator allocator = create_default_device_allocator();
 
-    CudaStream cuda_stream = make_cuda_stream();
-    auto cu_ptr            = cuda_stream.get();
+    CudaStream chain_alloc_cuda_stream = make_cuda_stream();
+    auto chain_alloc_cu_stream_ptr     = chain_alloc_cuda_stream.get();
 
     Anchor a;
     a.query_read_id_           = 12;
@@ -96,22 +173,17 @@ TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests)
 
     Overlap p_bc = chainerutils::create_overlap(b, c, 2);
 
-    std::vector<Anchor> anchors;
-    anchors.push_back(a);
-    anchors.push_back(b);
-    anchors.push_back(c);
+    std::vector<Anchor> anchors = {a, b, c};
 
-    std::vector<Overlap> overlaps;
-    overlaps.push_back(p_ab);
-    overlaps.push_back(p_bc);
+    std::vector<Overlap> overlaps = {p_ab, p_bc};
 
-    device_buffer<Anchor> anchors_d(anchors.size(), allocator, cu_ptr);
-    device_buffer<Overlap> overlaps_d(overlaps.size(), allocator, cu_ptr);
-    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), cu_ptr);
-    cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), cu_ptr);
+    device_buffer<Anchor> anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<Overlap> overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr);
 
-    device_buffer<int32_t> unrolled_anchor_chains(0, allocator, cu_ptr);
-    device_buffer<int32_t> chain_starts(0, allocator, cu_ptr);
+    device_buffer<int32_t> unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> chain_starts(0, allocator, chain_alloc_cu_stream_ptr);
     int64_t num_total_anchors;
 
     chainerutils::allocate_anchor_chains(overlaps_d,
@@ -119,14 +191,178 @@ TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests)
                                          chain_starts,
                                          num_total_anchors,
                                          allocator,
-                                         cu_ptr);
+                                         chain_alloc_cu_stream_ptr);
 
     ASSERT_EQ(num_total_anchors, 4);
 
     std::vector<int32_t> anchors_chain_starts_h(overlaps.size());
-    cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), cu_ptr);
+    cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), chain_alloc_cu_stream_ptr);
     ASSERT_EQ(anchors_chain_starts_h[0], 0);
     ASSERT_EQ(anchors_chain_starts_h[1], 2);
+    ASSERT_EQ(num_total_anchors, 4);
+}
+
+TEST(TestChainerUtils, Test_Output_Overlap_Chains_By_RLE)
+{
+    DefaultDeviceAllocator allocator = create_default_device_allocator();
+
+    CudaStream chain_alloc_cuda_stream = make_cuda_stream();
+    auto chain_alloc_cu_stream_ptr     = chain_alloc_cuda_stream.get();
+
+    Anchor a;
+    a.query_read_id_           = 12;
+    a.query_position_in_read_  = 130;
+    a.target_read_id_          = 46;
+    a.target_position_in_read_ = 10320;
+
+    Anchor b;
+    b.query_read_id_           = 12;
+    b.query_position_in_read_  = 10000;
+    b.target_read_id_          = 46;
+    b.target_position_in_read_ = 20400;
+
+    Anchor c;
+    c.query_read_id_           = 12;
+    c.query_position_in_read_  = 15000;
+    c.target_read_id_          = 46;
+    c.target_position_in_read_ = 1000;
+
+    Anchor d;
+    d.query_read_id_           = 12;
+    d.query_position_in_read_  = 20000;
+    d.target_read_id_          = 46;
+    d.target_position_in_read_ = 6000;
+
+    std::vector<Anchor> anchors = {a, b, c, d};
+
+    std::vector<int32_t> run_starts_h  = {0, 2};
+    std::vector<int32_t> run_lengths_h = {2, 2};
+    device_buffer<int32_t> run_starts_d(run_starts_h.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> run_lengths_d(run_starts_h.size(), allocator, chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(run_starts_h.data(), run_starts_h.size(), run_starts_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(run_lengths_h.data(), run_lengths_h.size(), run_lengths_d.data(), chain_alloc_cu_stream_ptr);
+
+    Overlap p_ab                  = chainerutils::create_overlap(a, b, 2);
+    Overlap p_cd                  = chainerutils::create_overlap(c, d, 2);
+    std::vector<Overlap> overlaps = {p_ab, p_cd};
+    device_buffer<Overlap> overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<Anchor> anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr);
+
+    device_buffer<int32_t> chain_starts(0, allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr);
+    int64_t num_total_anchors = 0;
+    chainerutils::allocate_anchor_chains(overlaps_d,
+                                         unrolled_anchor_chains,
+                                         chain_starts,
+                                         num_total_anchors,
+                                         allocator,
+                                         chain_alloc_cu_stream_ptr);
+
+    chainerutils::output_overlap_chains_by_RLE<<<1024, 64, 0, chain_alloc_cu_stream_ptr>>>(overlaps_d.data(),
+                                                                                           anchors_d.data(),
+                                                                                           run_starts_d.data(),
+                                                                                           run_lengths_d.data(),
+                                                                                           unrolled_anchor_chains.data(),
+                                                                                           chain_starts.data(),
+                                                                                           overlaps.size());
+    std::vector<int32_t> chain_starts_h;
+    chain_starts_h.resize(chain_starts.size());
+    cudautils::device_copy_n(chain_starts.data(), chain_starts.size(), chain_starts_h.data(), chain_alloc_cu_stream_ptr);
+    std::vector<int32_t> chains_h;
+    chains_h.resize(unrolled_anchor_chains.size());
+    cudautils::device_copy_n(unrolled_anchor_chains.data(), unrolled_anchor_chains.size(), chains_h.data(), chain_alloc_cu_stream_ptr);
+    ASSERT_EQ(unrolled_anchor_chains.size(), 4);
+    ASSERT_EQ(chain_starts.size(), 2);
+    ASSERT_EQ(chain_starts_h[0], 0);
+    ASSERT_EQ(chain_starts_h[1], 2);
+    ASSERT_EQ(chains_h[0], 0);
+    ASSERT_EQ(chains_h[2], 2);
+}
+
+TEST(TestChainerUtils, Test_Output_Overlap_Chains_By_Backtrace)
+{
+
+    DefaultDeviceAllocator allocator = create_default_device_allocator();
+
+    CudaStream chain_alloc_cuda_stream = make_cuda_stream();
+    auto chain_alloc_cu_stream_ptr     = chain_alloc_cuda_stream.get();
+
+    Anchor a;
+    a.query_read_id_           = 12;
+    a.query_position_in_read_  = 130;
+    a.target_read_id_          = 46;
+    a.target_position_in_read_ = 10320;
+
+    Anchor b;
+    b.query_read_id_           = 12;
+    b.query_position_in_read_  = 10000;
+    b.target_read_id_          = 46;
+    b.target_position_in_read_ = 20400;
+
+    Anchor c;
+    c.query_read_id_           = 12;
+    c.query_position_in_read_  = 15000;
+    c.target_read_id_          = 46;
+    c.target_position_in_read_ = 1000;
+
+    std::vector<Anchor> anchors                   = {a, b, c};
+    std::vector<float> scores                     = {10, 20, 25};
+    std::vector<int32_t> predecessors             = {-1, 0, 0};
+    std::vector<int32_t> overlap_terminal_anchors = {1, 2};
+
+    Overlap p_ab                  = chainerutils::create_overlap(a, b, 2);
+    Overlap p_ac                  = chainerutils::create_overlap(a, c, 2);
+    std::vector<Overlap> overlaps = {p_ab, p_ac};
+
+    device_buffer<Anchor> anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<Overlap> overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<bool> mask_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> pred_d(predecessors.size(), allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> terminals_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr);
+
+    cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(predecessors.data(), predecessors.size(), pred_d.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(overlap_terminal_anchors.data(), overlap_terminal_anchors.size(), terminals_d.data(), chain_alloc_cu_stream_ptr);
+
+    device_buffer<int32_t> unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr);
+    device_buffer<int32_t> chain_starts(0, allocator, chain_alloc_cu_stream_ptr);
+    int64_t num_total_anchors;
+
+    chainerutils::allocate_anchor_chains(overlaps_d,
+                                         unrolled_anchor_chains,
+                                         chain_starts,
+                                         num_total_anchors,
+                                         allocator,
+                                         chain_alloc_cu_stream_ptr);
+
+    chainerutils::output_overlap_chains_by_backtrace<<<1024, 64, 0, chain_alloc_cu_stream_ptr>>>(overlaps_d.data(),
+                                                                                                 anchors_d.data(),
+                                                                                                 mask_d.data(),
+                                                                                                 pred_d.data(),
+                                                                                                 terminals_d.data(),
+                                                                                                 unrolled_anchor_chains.data(),
+                                                                                                 chain_starts.data(),
+                                                                                                 overlaps.size(),
+                                                                                                 false);
+
+    std::vector<int32_t> unrolled_chains_h;
+    unrolled_chains_h.resize(unrolled_anchor_chains.size());
+    std::vector<int32_t> chain_starts_h;
+    chain_starts_h.resize(chain_starts.size());
+    cudautils::device_copy_n(unrolled_anchor_chains.data(), unrolled_anchor_chains.size(), unrolled_chains_h.data(), chain_alloc_cu_stream_ptr);
+    cudautils::device_copy_n(chain_starts.data(), chain_starts.size(), chain_starts_h.data(), chain_alloc_cu_stream_ptr);
+
+    ASSERT_EQ(chain_starts_h.size(), 2);
+    ASSERT_EQ(chain_starts_h[0], 0);
+    ASSERT_EQ(chain_starts_h[1], 2);
+    ASSERT_EQ(unrolled_chains_h[0], 0);
+    ASSERT_EQ(unrolled_chains_h[1], 1);
+    ASSERT_EQ(unrolled_chains_h[2], 0);
+    ASSERT_EQ(unrolled_chains_h[3], 2);
+    ASSERT_EQ(unrolled_anchor_chains.size(), 4);
 }
 
 } // namespace cudamapper