From 35889627e8644c42d9d50c146b0357b63e3f7de4 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 19 Oct 2020 12:54:18 -0400 Subject: [PATCH 01/26] [cudamapper] Add anchor chain output functions to chainer_utils.cuh. Adds two functions required for retrieving the anchors within an overlap. --- cudamapper/CMakeLists.txt | 1 + cudamapper/src/chainer_utils.cu | 534 +++++++++++++++++++++++++++++++ cudamapper/src/chainer_utils.cuh | 183 +++++++++++ 3 files changed, 718 insertions(+) create mode 100644 cudamapper/src/chainer_utils.cu create mode 100644 cudamapper/src/chainer_utils.cuh diff --git a/cudamapper/CMakeLists.txt b/cudamapper/CMakeLists.txt index 5b6d3be99..00b1c2054 100644 --- a/cudamapper/CMakeLists.txt +++ b/cudamapper/CMakeLists.txt @@ -46,6 +46,7 @@ endif() cuda_add_library(${MODULE_NAME} src/application_parameters.cpp + src/chainer_utils.cu src/cudamapper.cpp src/index_batcher.cu src/index_descriptor.cpp diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu new file mode 100644 index 000000000..97dde816d --- /dev/null +++ b/cudamapper/src/chainer_utils.cu @@ -0,0 +1,534 @@ +/* +* Copyright 2020 NVIDIA CORPORATION. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "chainer_utils.cuh" + +#include +#include +#include + +// Needed for accumulate - remove when ported to cuda +#include +#include + +#include +#include + +#include + +namespace claraparabricks +{ + +namespace genomeworks +{ + +namespace cudamapper +{ +namespace chainerutils +{ + +__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b) +{ + return a.query_read_id_ == b.query_read_id_ && a.target_read_id_ == b.target_read_id_; +} + +__device__ bool operator==(const QueryReadID& a, const QueryReadID& b) +{ + return a.query_read_id_ == b.query_read_id_; +} + +__device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) +{ + Overlap overlap; + overlap.num_residues_ = num_anchors; + + overlap.query_read_id_ = start.query_read_id_; + overlap.target_read_id_ = start.target_read_id_; + assert(start.query_read_id_ == end.query_read_id_ && start.target_read_id_ == end.target_read_id_); + + overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_); + overlap.query_end_position_in_read_ = max(start.query_position_in_read_, end.query_position_in_read_); + bool is_negative_strand = end.target_position_in_read_ < start.target_position_in_read_; + if (is_negative_strand) + { + overlap.relative_strand = RelativeStrand::Reverse; + overlap.target_start_position_in_read_ = end.target_position_in_read_; + overlap.target_end_position_in_read_ = start.target_position_in_read_; + } + else + { + overlap.relative_strand = RelativeStrand::Forward; + overlap.target_start_position_in_read_ = start.target_position_in_read_; + overlap.target_end_position_in_read_ = end.target_position_in_read_; + } + return overlap; +} + +void allocate_anchor_chains(device_buffer overlaps, + device_buffer& unrolled_anchor_chains, + const int32_t num_overlaps, + int32_t& num_total_anchors, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream) +{ + // sum the number of chains across all overlaps + device_buffer d_temp_buf(_allocator, _cuda_stream); + void* d_temp_storage = nullptr; + std::size_t temp_storage_bytes = 0; + OverlapToNumResiduesOp overlap_residue_count_op; + cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); + + device_buffer d_num_total_anchors(1, _allocator, _cuda_stream); + device_buffer d_chain_starts(num_overlaps, _allocator, _cuda_stream); // Index of the first anchor in the anchors array for an overlap. + device_buffer d_chain_lengths(num_overlaps, _allocator, _cuda_stream); // Equivalent to num_residues + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_chain_lengths.data(), + d_chain_starts.data(), + num_overlaps, + _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_chain_lengths.data(), + d_chain_starts.data(), + num_overlaps, + _cuda_stream); + + int32_t last_start = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream); + int32_t last_length = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream); + num_total_anchors = last_start + last_length; + + unrolled_anchor_chains.clear_and_resize(num_total_anchors); +} + +__global__ void output_overlap_chains(const Overlap* overlaps, + const Anchor* anchors, + const bool* select_mask, + const int32_t* predecessors, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps, + bool check_mask) +{ + int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int32_t stride = blockDim.x * gridDim.x; + + // Processes one overlap per iteration, + // "i" corresponds to an overlap + for (int i = d_thread_id; i < num_overlaps; i += stride) + { + // index within this chain of anchors (i.e., the anchors within a single overlap) + + if (!check_mask || (check_mask & select_mask[i])) + { + int32_t anchor_chain_index = 0; + // As chaining proceeds backwards (i.e., it's a backtrace), + // we need to fill the new anchor chain array in in reverse order. + int32_t index = anchor_chain_starts[i]; + while (index != -1) + { + anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index; + int32_t pred = predecessors[index]; + index = pred; + ++anchor_chain_index; + } + } + } +} + +__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, + Overlap* overlaps, + double* scores, + bool* max_select_mask, + int32_t* predecessors, + const int32_t n_anchors, + const int32_t min_score) +{ + const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; + if (d_tid < n_anchors) + { + + int32_t global_overlap_index = d_tid; + if (scores[d_tid] >= min_score) + { + + int32_t index = global_overlap_index; + int32_t first_index = index; + int32_t num_anchors_in_chain = 0; + Anchor final_anchor = anchors[global_overlap_index]; + + while (index != -1) + { + first_index = index; + int32_t pred = predecessors[index]; + if (pred != -1) + { + max_select_mask[pred] = false; + } + num_anchors_in_chain++; + index = predecessors[index]; + } + Anchor first_anchor = anchors[first_index]; + overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); + // Overlap final_overlap = overlaps[global_overlap_index]; + // printf("%d %d %d %d %d %d %d %f\n", + // final_overlap.query_read_id_, final_overlap.query_start_position_in_read_, final_overlap.query_end_position_in_read_, + // final_overlap.target_read_id_, final_overlap.target_start_position_in_read_, final_overlap.target_end_position_in_read_, + // final_overlap.num_residues_, + // final_score); + } + else + { + max_select_mask[global_overlap_index] = false; + } + } +} + +__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts) +{ + std::int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; + if (d_tid < n_starts) + { + ends[d_tid] = starts[d_tid] + lengths[d_tid] - 1; + } +} + +// we take each query that is encoded as length (ie there are 4 queries that have the same id) +// so we take the (how many queries are in that section) and divide it by the size of the tile to get the +// number of tiles for that query +__global__ void calculate_tiles_per_read(const std::int32_t* lengths, + const int32_t num_reads, + const int32_t tile_size, + std::int32_t* tiles_per_read) +{ + int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + if (d_thread_id < num_reads) + { + // if the number of queries in that section are not evenly divisible, we need an extra block + // to accomadate for the leftovers + int32_t n_integer_blocks = lengths[d_thread_id] / tile_size; + int32_t remainder = lengths[d_thread_id] % tile_size; + tiles_per_read[d_thread_id] = remainder == 0 ? n_integer_blocks : n_integer_blocks + 1; + } +} + +// TODO VI: threads may be overwriting each other. +// because the number of queries is greater than number of tiles, each thread here has to write the query starts +// for multiple tiles +__global__ void calculate_tile_starts(const std::int32_t* query_starts, + const std::int32_t* tiles_per_query, + std::int32_t* tile_starts, + const int32_t tile_size, + const int32_t num_queries, + const std::int32_t* tiles_per_query_up_to_point) +{ + int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int32_t stride = blockDim.x * gridDim.x; + if (d_thread_id < num_queries) + { + // for each tile, we look up the query it corresponds to and offset it by the which tile in the query + // we're at multiplied by the total size of the tile + // TODO VI: this memory access pattern seems a little off? Thread i and thread i+1 would overwrite eachother, no? + for (int i = 0; i < tiles_per_query[d_thread_id]; i++) + { + //finds the offset in the ragged array and offsets to find start of "next" sub array + tile_starts[tiles_per_query_up_to_point[d_thread_id] + i] = query_starts[d_thread_id] + (i * tile_size); + } + } + //int counter = 0; + //for (int i =0; i < num_queries; i++) + //{ + // for (int j = 0; j < tiles_per_query[i]; j++) + // { + // tile_starts[counter] = query_starts[i] + (j * tile_size); + // counter++; + // } + //} +} + +void encode_anchor_query_locations(const Anchor* anchors, + int32_t n_anchors, + int32_t tile_size, + device_buffer& query_starts, + device_buffer& query_lengths, + device_buffer& query_ends, + device_buffer& tiles_per_query, + device_buffer& tile_starts, + int32_t& n_queries, + int32_t& n_query_tiles, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size) +{ + AnchorToQueryReadIDOp anchor_to_read_op; + // This takes anchors and outputs and converts the anchors to QueryReadID types (references) + cub::TransformInputIterator d_queries(anchors, anchor_to_read_op); + // create buffer of size number of anchors + device_buffer d_query_read_ids(n_anchors, _allocator, _cuda_stream); + // vector of number of read ids...? I don't know what this is for + // This is used to store the length of the encoded sequence + device_buffer d_num_query_read_ids(1, _allocator, _cuda_stream); + + // don't know what this is for yet + // this is for internal use for run length encoding + device_buffer d_temp_buf(_allocator, _cuda_stream); + void* d_temp_storage = nullptr; + std::size_t temp_storage_bytes = 0; + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_queries, + d_query_read_ids.data(), + query_lengths.data(), + d_num_query_read_ids.data(), + n_anchors); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_queries, + d_query_read_ids.data(), + query_lengths.data(), // this is the vector of encoded lengths + d_num_query_read_ids.data(), + n_anchors); + // this is just the "length" of the encoded sequence + n_queries = cudautils::get_value_from_device(d_num_query_read_ids.data(), _cuda_stream); + d_temp_storage = nullptr; + temp_storage_bytes = 0; + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_lengths.data(), // this is the vector of encoded lengths + query_starts.data(), // at this point, this vector is empty + n_queries, + _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + // this gives us the number of queries up to that point. eg How many query starts we have at each index + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_lengths.data(), // this is the vector of encoded lengths + query_starts.data(), + n_queries, + _cuda_stream); + + // paper uses the ends and finds the beginnings with x - w + 1, are we converting to that here? + // TODO VI: I'm not entirely sure what this is for? I think we want to change the read query + // (defined by [query_start, query_start + query_length] to [query_end - query_length + 1, query_end]) + // The above () is NOT true + convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), // this gives how many starts at each index + query_lengths.data(), // this is the vector of encoded lengths + query_ends.data(), + n_queries); + + // what case is this? A: This always runs because TILE_SIZE is fixed at 1024 + if (tile_size > 0) + { + calculate_tiles_per_read<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_lengths.data(), n_queries, tile_size, tiles_per_query.data()); + // This stores the total number of tiles that we need, calculated with the below Sum() + device_buffer d_n_query_tiles(1, _allocator, _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries); + + // this is the length of tile_starts + n_query_tiles = cudautils::get_value_from_device(d_n_query_tiles.data(), _cuda_stream); + + // This is used to calculate the offsets for tile_starts + device_buffer d_tiles_per_query_up_to_point(n_queries, _allocator, _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + tiles_per_query.data(), // this is the vector of encoded lengths + d_tiles_per_query_up_to_point.data(), + n_queries, + _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + tiles_per_query.data(), // this is the vector of encoded lengths + d_tiles_per_query_up_to_point.data(), + n_queries, + _cuda_stream); + + calculate_tile_starts<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), tiles_per_query.data(), tile_starts.data(), tile_size, n_queries, d_tiles_per_query_up_to_point.data()); + } +} + +void encode_anchor_query_target_pairs(const Anchor* anchors, + int32_t n_anchors, + int32_t tile_size, + device_buffer& query_target_starts, + device_buffer& query_target_lengths, + device_buffer& query_target_ends, + device_buffer& tiles_per_read, + int32_t& n_query_target_pairs, + int32_t& n_qt_tiles, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size) +{ + AnchorToQueryTargetPairOp qt_pair_op; + cub::TransformInputIterator d_query_target_pairs(anchors, qt_pair_op); + device_buffer d_qt_pairs(n_anchors, _allocator, _cuda_stream); + device_buffer d_num_query_target_pairs(1, _allocator, _cuda_stream); + + device_buffer d_temp_buf(_allocator, _cuda_stream); + void* d_temp_storage = nullptr; + std::size_t temp_storage_bytes = 0; + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_query_target_pairs, + d_qt_pairs.data(), + query_target_lengths.data(), + d_num_query_target_pairs.data(), + n_anchors); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_query_target_pairs, + d_qt_pairs.data(), + query_target_lengths.data(), // this is the vector of encoded lengths + d_num_query_target_pairs.data(), + n_anchors); + + n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_target_lengths.data(), + query_target_starts.data(), + n_query_target_pairs, + _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_target_lengths.data(), + query_target_starts.data(), + n_query_target_pairs, _cuda_stream); + + convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), + query_target_lengths.data(), + query_target_ends.data(), + n_query_target_pairs); + + if (tile_size > 0) + { + calculate_tiles_per_read<<<(n_query_target_pairs / block_size) + 1, 32, 0, _cuda_stream>>>(query_target_starts.data(), n_query_target_pairs, tile_size, tiles_per_read.data()); + device_buffer d_n_qt_tiles(1, _allocator, _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs); + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs); + n_qt_tiles = cudautils::get_value_from_device(d_n_qt_tiles.data(), _cuda_stream); + } +} + +void encode_overlap_query_target_pairs(Overlap* overlaps, + int32_t n_overlaps, + device_buffer& query_target_starts, + device_buffer& query_target_lengths, + device_buffer& query_target_ends, + int32_t& n_query_target_pairs, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size) +{ + OverlapToQueryTargetPairOp qt_pair_op; + cub::TransformInputIterator d_query_target_pairs(overlaps, qt_pair_op); + device_buffer d_qt_pairs(n_overlaps, _allocator, _cuda_stream); + device_buffer d_num_query_target_pairs(1, _allocator, _cuda_stream); + + device_buffer d_temp_buf(_allocator, _cuda_stream); + void* d_temp_storage = nullptr; + std::size_t temp_storage_bytes = 0; + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_query_target_pairs, + d_qt_pairs.data(), + query_target_lengths.data(), + d_num_query_target_pairs.data(), + n_overlaps); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceRunLengthEncode::Encode(d_temp_storage, + temp_storage_bytes, + d_query_target_pairs, + d_qt_pairs.data(), + query_target_lengths.data(), + d_num_query_target_pairs.data(), + n_overlaps); + + n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_target_lengths.data(), + query_target_starts.data(), + n_query_target_pairs, _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + query_target_lengths.data(), + query_target_starts.data(), + n_query_target_pairs, _cuda_stream); + + convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), query_target_lengths.data(), query_target_ends.data(), n_query_target_pairs); +} + +} // namespace chainerutils +} // namespace cudamapper +} // namespace genomeworks +} // namespace claraparabricks \ No newline at end of file diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh new file mode 100644 index 000000000..ff077de9c --- /dev/null +++ b/cudamapper/src/chainer_utils.cuh @@ -0,0 +1,183 @@ +/* +* Copyright 2020 NVIDIA CORPORATION. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#pragma once + +#include +#include +#include + +// Needed for accumulate - remove when ported to cuda +#include +#include + +#include +#include + +namespace claraparabricks +{ + +namespace genomeworks +{ + +namespace cudamapper +{ +namespace chainerutils +{ + +#define MAX_CHAINS_PER_TILE 5 + +struct QueryTargetPair +{ + int32_t query_read_id_; + int32_t target_read_id_; + __device__ QueryTargetPair() {} +}; + +struct QueryReadID +{ + int32_t query_read_id_; + __device__ QueryReadID(){}; +}; + +// takes the anchor and returns the query read id +struct AnchorToQueryReadIDOp +{ + __device__ __forceinline__ QueryReadID operator()(const Anchor& a) const + { + QueryReadID query; + query.query_read_id_ = a.query_read_id_; + return query; + } +}; + +__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b); + +struct OverlapToQueryTargetPairOp +{ + __device__ __forceinline__ QueryTargetPair operator()(const Overlap& a) const + { + QueryTargetPair p; + p.query_read_id_ = a.query_read_id_; + p.target_read_id_ = a.target_read_id_; + return p; + } +}; + +struct AnchorToQueryTargetPairOp +{ + __device__ __forceinline__ QueryTargetPair operator()(const Anchor& a) const + { + QueryTargetPair p; + p.query_read_id_ = a.query_read_id_; + p.target_read_id_ = a.target_read_id_; + return p; + } +}; + +struct OverlapToNumResiduesOp +{ + __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const + { + return overlap.num_residues_; + } +}; + +__device__ bool +operator==(const QueryTargetPair& a, const QueryTargetPair& b); + +__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, + Overlap* overlaps, + double* scores, + bool* max_select_mask, + int32_t* predecessors, + const int32_t n_anchors, + const int32_t min_score); +/// +///@brief Allocate a 1-dimensional array representing an unrolled 2D-array +/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than +/// copy the anchors, the final array holds the indices of the global index array. +/// +///@param overlaps An array of Overlaps. Must have a well-formed num_residues_ field. +///@param unrolled_anchor_chains An array of int32_t. Will be resided on return. +///@param num_overlaps The number of overlaps in the overlaps array. +///@param _allocator The DefaultDeviceAllocator +///@param _cuda_stream The cudastream to allocate memory within. +void allocate_anchor_chains(device_buffer overlaps, + device_buffer& unrolled_anchor_chains, + const int32_t num_overlaps, + int32_t& num_total_anchors, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream); + +__global__ void output_overlap_chains(const Overlap* overlaps, + const Anchor* anchors, + const bool* select_mask, + const int32_t* predecessors, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps, + bool check_mask); + +__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts); + +__global__ void calculate_tile_starts(const std::int32_t* query_starts, + const std::int32_t* tiles_per_query, + std::int32_t* tile_starts, + const int32_t tile_size, + int32_t num_queries, + const std::int32_t* tiles_per_query_up_to_point); + +void encode_anchor_query_locations(const Anchor* anchors, + int32_t n_anchors, + int32_t tile_size, + device_buffer& query_starts, + device_buffer& query_lengths, + device_buffer& query_ends, + device_buffer& tiles_per_query, + device_buffer& tile_starts, + int32_t& n_queries, + int32_t& n_query_tiles, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size); + +void encode_anchor_query_target_pairs(const Anchor* anchors, + int32_t n_anchors, + int32_t tile_size, + device_buffer& query_target_pair_starts, + device_buffer& query_target_pair_lengths, + device_buffer& query_target_pair_ends, + device_buffer& tiles_per_qt_pair, + int32_t& n_query_target_pairs, + int32_t& n_qt_tiles, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size); + +void encode_overlap_query_target_pairs(Overlap* overlaps, + int32_t n_overlaps, + device_buffer& query_target_pair_starts, + device_buffer& query_target_pair_lengths, + device_buffer& query_target_pair_ends, + int32_t& n_query_target_pairs, + DefaultDeviceAllocator& _allocator, + cudaStream_t& _cuda_stream, + int32_t block_size); +} // namespace chainerutils +} // namespace cudamapper +} // namespace genomeworks +} // namespace claraparabricks \ No newline at end of file From 19040dcb610f4241b49f0ea684765bc0a90cd792 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 19 Oct 2020 17:37:02 -0400 Subject: [PATCH 02/26] [cudamapper] Add anchor chain trace for RLE and add anchor chain start calculation to output_overlap_chains_by_backtrace. --- cudamapper/src/chainer_utils.cu | 409 +++++-------------------------- cudamapper/src/chainer_utils.cuh | 72 ++---- 2 files changed, 76 insertions(+), 405 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 97dde816d..3b00a4c1a 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -79,6 +79,7 @@ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, void allocate_anchor_chains(device_buffer overlaps, device_buffer& unrolled_anchor_chains, + device_buffer& anchor_chain_starts, const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, @@ -92,13 +93,36 @@ void allocate_anchor_chains(device_buffer overlaps, cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); device_buffer d_num_total_anchors(1, _allocator, _cuda_stream); - device_buffer d_chain_starts(num_overlaps, _allocator, _cuda_stream); // Index of the first anchor in the anchors array for an overlap. - device_buffer d_chain_lengths(num_overlaps, _allocator, _cuda_stream); // Equivalent to num_residues + + cub::DeviceReduce::Sum(d_temp_storage, + temp_storage_bytes, + d_residue_counts, + d_num_total_anchors.data(), + num_overlaps, + _cuda_stream); + + d_temp_buf.clear_and_resize(temp_storage_bytes); + d_temp_storage = d_temp_buf.data(); + + cub::DeviceReduce::Sum(d_temp_storage, + temp_storage_bytes, + d_residue_counts, + d_num_total_anchors.data(), + num_overlaps, + _cuda_stream); + + d_temp_storage = nullptr; + temp_storage_bytes = 0; + + num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), _cuda_stream); + + unrolled_anchor_chains.clear_and_resize(num_total_anchors); + anchor_chain_starts.clear_and_resize(num_overlaps); cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, - d_chain_lengths.data(), - d_chain_starts.data(), + d_residue_counts, + anchor_chain_starts.data(), num_overlaps, _cuda_stream); @@ -107,26 +131,41 @@ void allocate_anchor_chains(device_buffer overlaps, cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, - d_chain_lengths.data(), - d_chain_starts.data(), + d_residue_counts, + anchor_chain_starts.data(), num_overlaps, _cuda_stream); +} - int32_t last_start = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream); - int32_t last_length = cudautils::get_value_from_device(d_num_total_anchors.data() + num_overlaps - 1, _cuda_stream); - num_total_anchors = last_start + last_length; - - unrolled_anchor_chains.clear_and_resize(num_total_anchors); +__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, + const Anchor* anchors, + const int32_t* chain_starts, + const int32_t* chain_lengths, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps) +{ + int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int32_t stride = blockDim.x * gridDim.x; + for (int i = d_thread_id; i < num_overlaps; i += stride) + { + int32_t chain_start = chain_starts[i]; + int32_t chain_length = chain_lengths[i]; + for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i) + { + anchor_chains[ind] = ind; + } + } } -__global__ void output_overlap_chains(const Overlap* overlaps, - const Anchor* anchors, - const bool* select_mask, - const int32_t* predecessors, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps, - bool check_mask) +__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, + const Anchor* anchors, + const bool* select_mask, + const int32_t* predecessors, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps, + bool check_mask) { int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; int32_t stride = blockDim.x * gridDim.x; @@ -188,12 +227,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, } Anchor first_anchor = anchors[first_index]; overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); - // Overlap final_overlap = overlaps[global_overlap_index]; - // printf("%d %d %d %d %d %d %d %f\n", - // final_overlap.query_read_id_, final_overlap.query_start_position_in_read_, final_overlap.query_end_position_in_read_, - // final_overlap.target_read_id_, final_overlap.target_start_position_in_read_, final_overlap.target_end_position_in_read_, - // final_overlap.num_residues_, - // final_score); } else { @@ -202,332 +235,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, } } -__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts) -{ - std::int32_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; - if (d_tid < n_starts) - { - ends[d_tid] = starts[d_tid] + lengths[d_tid] - 1; - } -} - -// we take each query that is encoded as length (ie there are 4 queries that have the same id) -// so we take the (how many queries are in that section) and divide it by the size of the tile to get the -// number of tiles for that query -__global__ void calculate_tiles_per_read(const std::int32_t* lengths, - const int32_t num_reads, - const int32_t tile_size, - std::int32_t* tiles_per_read) -{ - int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - if (d_thread_id < num_reads) - { - // if the number of queries in that section are not evenly divisible, we need an extra block - // to accomadate for the leftovers - int32_t n_integer_blocks = lengths[d_thread_id] / tile_size; - int32_t remainder = lengths[d_thread_id] % tile_size; - tiles_per_read[d_thread_id] = remainder == 0 ? n_integer_blocks : n_integer_blocks + 1; - } -} - -// TODO VI: threads may be overwriting each other. -// because the number of queries is greater than number of tiles, each thread here has to write the query starts -// for multiple tiles -__global__ void calculate_tile_starts(const std::int32_t* query_starts, - const std::int32_t* tiles_per_query, - std::int32_t* tile_starts, - const int32_t tile_size, - const int32_t num_queries, - const std::int32_t* tiles_per_query_up_to_point) -{ - int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int32_t stride = blockDim.x * gridDim.x; - if (d_thread_id < num_queries) - { - // for each tile, we look up the query it corresponds to and offset it by the which tile in the query - // we're at multiplied by the total size of the tile - // TODO VI: this memory access pattern seems a little off? Thread i and thread i+1 would overwrite eachother, no? - for (int i = 0; i < tiles_per_query[d_thread_id]; i++) - { - //finds the offset in the ragged array and offsets to find start of "next" sub array - tile_starts[tiles_per_query_up_to_point[d_thread_id] + i] = query_starts[d_thread_id] + (i * tile_size); - } - } - //int counter = 0; - //for (int i =0; i < num_queries; i++) - //{ - // for (int j = 0; j < tiles_per_query[i]; j++) - // { - // tile_starts[counter] = query_starts[i] + (j * tile_size); - // counter++; - // } - //} -} - -void encode_anchor_query_locations(const Anchor* anchors, - int32_t n_anchors, - int32_t tile_size, - device_buffer& query_starts, - device_buffer& query_lengths, - device_buffer& query_ends, - device_buffer& tiles_per_query, - device_buffer& tile_starts, - int32_t& n_queries, - int32_t& n_query_tiles, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size) -{ - AnchorToQueryReadIDOp anchor_to_read_op; - // This takes anchors and outputs and converts the anchors to QueryReadID types (references) - cub::TransformInputIterator d_queries(anchors, anchor_to_read_op); - // create buffer of size number of anchors - device_buffer d_query_read_ids(n_anchors, _allocator, _cuda_stream); - // vector of number of read ids...? I don't know what this is for - // This is used to store the length of the encoded sequence - device_buffer d_num_query_read_ids(1, _allocator, _cuda_stream); - - // don't know what this is for yet - // this is for internal use for run length encoding - device_buffer d_temp_buf(_allocator, _cuda_stream); - void* d_temp_storage = nullptr; - std::size_t temp_storage_bytes = 0; - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_queries, - d_query_read_ids.data(), - query_lengths.data(), - d_num_query_read_ids.data(), - n_anchors); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_queries, - d_query_read_ids.data(), - query_lengths.data(), // this is the vector of encoded lengths - d_num_query_read_ids.data(), - n_anchors); - // this is just the "length" of the encoded sequence - n_queries = cudautils::get_value_from_device(d_num_query_read_ids.data(), _cuda_stream); - d_temp_storage = nullptr; - temp_storage_bytes = 0; - - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_lengths.data(), // this is the vector of encoded lengths - query_starts.data(), // at this point, this vector is empty - n_queries, - _cuda_stream); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - // this gives us the number of queries up to that point. eg How many query starts we have at each index - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_lengths.data(), // this is the vector of encoded lengths - query_starts.data(), - n_queries, - _cuda_stream); - - // paper uses the ends and finds the beginnings with x - w + 1, are we converting to that here? - // TODO VI: I'm not entirely sure what this is for? I think we want to change the read query - // (defined by [query_start, query_start + query_length] to [query_end - query_length + 1, query_end]) - // The above () is NOT true - convert_offsets_to_ends<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), // this gives how many starts at each index - query_lengths.data(), // this is the vector of encoded lengths - query_ends.data(), - n_queries); - - // what case is this? A: This always runs because TILE_SIZE is fixed at 1024 - if (tile_size > 0) - { - calculate_tiles_per_read<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_lengths.data(), n_queries, tile_size, tiles_per_query.data()); - // This stores the total number of tiles that we need, calculated with the below Sum() - device_buffer d_n_query_tiles(1, _allocator, _cuda_stream); - - d_temp_storage = nullptr; - temp_storage_bytes = 0; - cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_query.data(), d_n_query_tiles.data(), n_queries); - - // this is the length of tile_starts - n_query_tiles = cudautils::get_value_from_device(d_n_query_tiles.data(), _cuda_stream); - - // This is used to calculate the offsets for tile_starts - device_buffer d_tiles_per_query_up_to_point(n_queries, _allocator, _cuda_stream); - - d_temp_storage = nullptr; - temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - tiles_per_query.data(), // this is the vector of encoded lengths - d_tiles_per_query_up_to_point.data(), - n_queries, - _cuda_stream); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - tiles_per_query.data(), // this is the vector of encoded lengths - d_tiles_per_query_up_to_point.data(), - n_queries, - _cuda_stream); - - calculate_tile_starts<<<(n_queries / block_size) + 1, block_size, 0, _cuda_stream>>>(query_starts.data(), tiles_per_query.data(), tile_starts.data(), tile_size, n_queries, d_tiles_per_query_up_to_point.data()); - } -} - -void encode_anchor_query_target_pairs(const Anchor* anchors, - int32_t n_anchors, - int32_t tile_size, - device_buffer& query_target_starts, - device_buffer& query_target_lengths, - device_buffer& query_target_ends, - device_buffer& tiles_per_read, - int32_t& n_query_target_pairs, - int32_t& n_qt_tiles, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size) -{ - AnchorToQueryTargetPairOp qt_pair_op; - cub::TransformInputIterator d_query_target_pairs(anchors, qt_pair_op); - device_buffer d_qt_pairs(n_anchors, _allocator, _cuda_stream); - device_buffer d_num_query_target_pairs(1, _allocator, _cuda_stream); - - device_buffer d_temp_buf(_allocator, _cuda_stream); - void* d_temp_storage = nullptr; - std::size_t temp_storage_bytes = 0; - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_query_target_pairs, - d_qt_pairs.data(), - query_target_lengths.data(), - d_num_query_target_pairs.data(), - n_anchors); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_query_target_pairs, - d_qt_pairs.data(), - query_target_lengths.data(), // this is the vector of encoded lengths - d_num_query_target_pairs.data(), - n_anchors); - - n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream); - - d_temp_storage = nullptr; - temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_target_lengths.data(), - query_target_starts.data(), - n_query_target_pairs, - _cuda_stream); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_target_lengths.data(), - query_target_starts.data(), - n_query_target_pairs, _cuda_stream); - - convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), - query_target_lengths.data(), - query_target_ends.data(), - n_query_target_pairs); - - if (tile_size > 0) - { - calculate_tiles_per_read<<<(n_query_target_pairs / block_size) + 1, 32, 0, _cuda_stream>>>(query_target_starts.data(), n_query_target_pairs, tile_size, tiles_per_read.data()); - device_buffer d_n_qt_tiles(1, _allocator, _cuda_stream); - - d_temp_storage = nullptr; - temp_storage_bytes = 0; - cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs); - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, tiles_per_read.data(), d_n_qt_tiles.data(), n_query_target_pairs); - n_qt_tiles = cudautils::get_value_from_device(d_n_qt_tiles.data(), _cuda_stream); - } -} - -void encode_overlap_query_target_pairs(Overlap* overlaps, - int32_t n_overlaps, - device_buffer& query_target_starts, - device_buffer& query_target_lengths, - device_buffer& query_target_ends, - int32_t& n_query_target_pairs, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size) -{ - OverlapToQueryTargetPairOp qt_pair_op; - cub::TransformInputIterator d_query_target_pairs(overlaps, qt_pair_op); - device_buffer d_qt_pairs(n_overlaps, _allocator, _cuda_stream); - device_buffer d_num_query_target_pairs(1, _allocator, _cuda_stream); - - device_buffer d_temp_buf(_allocator, _cuda_stream); - void* d_temp_storage = nullptr; - std::size_t temp_storage_bytes = 0; - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_query_target_pairs, - d_qt_pairs.data(), - query_target_lengths.data(), - d_num_query_target_pairs.data(), - n_overlaps); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceRunLengthEncode::Encode(d_temp_storage, - temp_storage_bytes, - d_query_target_pairs, - d_qt_pairs.data(), - query_target_lengths.data(), - d_num_query_target_pairs.data(), - n_overlaps); - - n_query_target_pairs = cudautils::get_value_from_device(d_num_query_target_pairs.data(), _cuda_stream); - - d_temp_storage = nullptr; - temp_storage_bytes = 0; - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_target_lengths.data(), - query_target_starts.data(), - n_query_target_pairs, _cuda_stream); - - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); - - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - query_target_lengths.data(), - query_target_starts.data(), - n_query_target_pairs, _cuda_stream); - - convert_offsets_to_ends<<<(n_query_target_pairs / block_size) + 1, block_size, 0, _cuda_stream>>>(query_target_starts.data(), query_target_lengths.data(), query_target_ends.data(), n_query_target_pairs); -} - } // namespace chainerutils } // namespace cudamapper } // namespace genomeworks diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index ff077de9c..8f4b09180 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -118,65 +118,29 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, ///@param _cuda_stream The cudastream to allocate memory within. void allocate_anchor_chains(device_buffer overlaps, device_buffer& unrolled_anchor_chains, + device_buffer& anchor_chain_starts, const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream); -__global__ void output_overlap_chains(const Overlap* overlaps, - const Anchor* anchors, - const bool* select_mask, - const int32_t* predecessors, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps, - bool check_mask); - -__global__ void convert_offsets_to_ends(std::int32_t* starts, std::int32_t* lengths, std::int32_t* ends, std::int32_t n_starts); - -__global__ void calculate_tile_starts(const std::int32_t* query_starts, - const std::int32_t* tiles_per_query, - std::int32_t* tile_starts, - const int32_t tile_size, - int32_t num_queries, - const std::int32_t* tiles_per_query_up_to_point); - -void encode_anchor_query_locations(const Anchor* anchors, - int32_t n_anchors, - int32_t tile_size, - device_buffer& query_starts, - device_buffer& query_lengths, - device_buffer& query_ends, - device_buffer& tiles_per_query, - device_buffer& tile_starts, - int32_t& n_queries, - int32_t& n_query_tiles, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size); - -void encode_anchor_query_target_pairs(const Anchor* anchors, - int32_t n_anchors, - int32_t tile_size, - device_buffer& query_target_pair_starts, - device_buffer& query_target_pair_lengths, - device_buffer& query_target_pair_ends, - device_buffer& tiles_per_qt_pair, - int32_t& n_query_target_pairs, - int32_t& n_qt_tiles, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size); - -void encode_overlap_query_target_pairs(Overlap* overlaps, - int32_t n_overlaps, - device_buffer& query_target_pair_starts, - device_buffer& query_target_pair_lengths, - device_buffer& query_target_pair_ends, - int32_t& n_query_target_pairs, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream, - int32_t block_size); +__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, + const Anchor* anchors, + const bool* select_mask, + const int32_t* predecessors, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps, + bool check_mask); + +__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, + const Anchor* anchors, + const int32_t* chain_starts, + const int32_t* chain_lengths, + int32_t* anchor_chains, + int32_t* anchor_chain_starts, + int32_t num_overlaps); + } // namespace chainerutils } // namespace cudamapper } // namespace genomeworks From 233cfc22e82ca3dea34d58c4fc83f404d230dcd9 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 19 Oct 2020 17:40:08 -0400 Subject: [PATCH 03/26] [cudamapper] Clean up unnecessary functions in chainer_utils. --- cudamapper/src/chainer_utils.cu | 10 ------ cudamapper/src/chainer_utils.cuh | 53 -------------------------------- 2 files changed, 63 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 3b00a4c1a..e2af3d443 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -40,16 +40,6 @@ namespace cudamapper namespace chainerutils { -__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b) -{ - return a.query_read_id_ == b.query_read_id_ && a.target_read_id_ == b.target_read_id_; -} - -__device__ bool operator==(const QueryReadID& a, const QueryReadID& b) -{ - return a.query_read_id_ == b.query_read_id_; -} - __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) { Overlap overlap; diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 8f4b09180..f2862920c 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -38,56 +38,6 @@ namespace cudamapper namespace chainerutils { -#define MAX_CHAINS_PER_TILE 5 - -struct QueryTargetPair -{ - int32_t query_read_id_; - int32_t target_read_id_; - __device__ QueryTargetPair() {} -}; - -struct QueryReadID -{ - int32_t query_read_id_; - __device__ QueryReadID(){}; -}; - -// takes the anchor and returns the query read id -struct AnchorToQueryReadIDOp -{ - __device__ __forceinline__ QueryReadID operator()(const Anchor& a) const - { - QueryReadID query; - query.query_read_id_ = a.query_read_id_; - return query; - } -}; - -__device__ bool operator==(const QueryTargetPair& a, const QueryTargetPair& b); - -struct OverlapToQueryTargetPairOp -{ - __device__ __forceinline__ QueryTargetPair operator()(const Overlap& a) const - { - QueryTargetPair p; - p.query_read_id_ = a.query_read_id_; - p.target_read_id_ = a.target_read_id_; - return p; - } -}; - -struct AnchorToQueryTargetPairOp -{ - __device__ __forceinline__ QueryTargetPair operator()(const Anchor& a) const - { - QueryTargetPair p; - p.query_read_id_ = a.query_read_id_; - p.target_read_id_ = a.target_read_id_; - return p; - } -}; - struct OverlapToNumResiduesOp { __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const @@ -96,9 +46,6 @@ struct OverlapToNumResiduesOp } }; -__device__ bool -operator==(const QueryTargetPair& a, const QueryTargetPair& b); - __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, Overlap* overlaps, double* scores, From 1cc1c25b82fb0f0c85badb66937ccb6d3acf57a4 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 19 Oct 2020 22:54:25 -0400 Subject: [PATCH 04/26] [cudamapper] Add tests for allocating anchor chains in chainer_utils. However, tests fail because the allocator used in testing can't actually allocate memory. --- cudamapper/src/chainer_utils.cu | 6 +++--- cudamapper/src/chainer_utils.cuh | 8 ++++++-- cudamapper/tests/CMakeLists.txt | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index e2af3d443..16f0f6d72 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -40,7 +40,7 @@ namespace cudamapper namespace chainerutils { -__device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) +__host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) { Overlap overlap; overlap.num_residues_ = num_anchors; @@ -67,10 +67,10 @@ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, return overlap; } -void allocate_anchor_chains(device_buffer overlaps, +void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - const int32_t num_overlaps, + int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream) diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index f2862920c..5b5ca7366 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -46,6 +46,10 @@ struct OverlapToNumResiduesOp } }; +__host__ __device__ Overlap create_simple_overlap(const Anchor& start, + const Anchor& end, + const int32_t num_anchors); + __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, Overlap* overlaps, double* scores, @@ -63,10 +67,10 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, ///@param num_overlaps The number of overlaps in the overlaps array. ///@param _allocator The DefaultDeviceAllocator ///@param _cuda_stream The cudastream to allocate memory within. -void allocate_anchor_chains(device_buffer overlaps, +void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - const int32_t num_overlaps, + int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream); diff --git a/cudamapper/tests/CMakeLists.txt b/cudamapper/tests/CMakeLists.txt index da8617978..acfb03a00 100644 --- a/cudamapper/tests/CMakeLists.txt +++ b/cudamapper/tests/CMakeLists.txt @@ -29,6 +29,7 @@ set(SOURCES Test_CudamapperOverlapper.cpp Test_CudamapperOverlapperTriggered.cu Test_CudamapperUtilsKmerFunctions.cpp + Test_CudamapperChainerUtils.cu ) get_property(cudamapper_data_include_dir GLOBAL PROPERTY cudamapper_data_include_dir) From 6680ba6f44a80990db50816a78451d56c886a77a Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Tue, 20 Oct 2020 11:01:20 -0400 Subject: [PATCH 05/26] Add tests for chainerutils. --- .../tests/Test_CudamapperChainerUtils.cu | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 cudamapper/tests/Test_CudamapperChainerUtils.cu diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu new file mode 100644 index 000000000..6348c1602 --- /dev/null +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -0,0 +1,139 @@ +/* +* Copyright 2019-2020 NVIDIA CORPORATION. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "gtest/gtest.h" + +#include +#include +#include + +#include +#include "../src/chainer_utils.cuh" + +namespace claraparabricks +{ + +namespace genomeworks +{ + +namespace cudamapper +{ + +TEST(TestChainerUtils, Create_Simple_Overlap_Tests) +{ + Anchor a; + a.query_read_id_ = 12; + a.query_position_in_read_ = 130; + a.target_read_id_ = 46; + a.target_position_in_read_ = 10320; + + Anchor b; + b.query_read_id_ = 12; + b.query_position_in_read_ = 10000; + b.target_read_id_ = 46; + b.target_position_in_read_ = 20400; + + Overlap p_ab = chainerutils::create_simple_overlap(a, b, 12); + ASSERT_EQ(p_ab.query_read_id_, static_cast(12)); + ASSERT_EQ(p_ab.target_read_id_, 46); + ASSERT_EQ(p_ab.query_start_position_in_read_, 130); + ASSERT_EQ(p_ab.query_end_position_in_read_, 10000); + ASSERT_EQ(p_ab.target_start_position_in_read_, 10320); + ASSERT_EQ(p_ab.num_residues_, 12); + ASSERT_EQ(static_cast(p_ab.relative_strand), static_cast(RelativeStrand::Forward)); + + Anchor c; + c.query_read_id_ = 12; + c.query_position_in_read_ = 15000; + c.target_read_id_ = 46; + c.target_position_in_read_ = 16000; + + Overlap p_bc = chainerutils::create_simple_overlap(b, c, 22); + ASSERT_EQ(p_bc.target_start_position_in_read_, 16000); + ASSERT_EQ(p_bc.target_end_position_in_read_, 20400); + ASSERT_EQ(static_cast(p_bc.relative_strand), static_cast(RelativeStrand::Reverse)); +} + +TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) +{ + + DefaultDeviceAllocator allocator = create_default_device_allocator(2048); + + CudaStream cuda_stream = make_cuda_stream(); + auto cu_ptr = cuda_stream.get(); + + Anchor a; + a.query_read_id_ = 12; + a.query_position_in_read_ = 130; + a.target_read_id_ = 46; + a.target_position_in_read_ = 10320; + + Anchor b; + b.query_read_id_ = 12; + b.query_position_in_read_ = 10000; + b.target_read_id_ = 46; + b.target_position_in_read_ = 20400; + + Overlap p_ab = chainerutils::create_simple_overlap(a, b, 2); + + Anchor c; + c.query_read_id_ = 12; + c.query_position_in_read_ = 15000; + c.target_read_id_ = 46; + c.target_position_in_read_ = 16000; + + Overlap p_bc = chainerutils::create_simple_overlap(b, c, 2); + + std::vector anchors; + anchors.push_back(a); + anchors.push_back(b); + anchors.push_back(c); + + std::vector overlaps; + overlaps.push_back(p_ab); + overlaps.push_back(p_bc); + + device_buffer d_anchors(anchors.size(), allocator, cuda_stream.get()); + device_buffer d_overlaps(overlaps.size(), allocator, cuda_stream.get()); + cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data()); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data()); + + // device_buffer unrolled_anchor_chains(1, allocator, cuda_stream.get()); + // device_buffer chain_starts(1, allocator, cuda_stream.get()); + + device_buffer unrolled_anchor_chains; + device_buffer chain_starts; + + int32_t num_total_anchors; + chainerutils::allocate_anchor_chains(d_overlaps, + unrolled_anchor_chains, + chain_starts, + overlaps.size(), + num_total_anchors, + allocator, + cu_ptr); + // void allocate_anchor_chains(device_buffer overlaps, + // device_buffer& unrolled_anchor_chains, + // device_buffer& anchor_chain_starts, + // const int32_t num_overlaps, + // int32_t& num_total_anchors, + // DefaultDeviceAllocator& _allocator, + // cudaStream_t& _cuda_stream); +} + +} // namespace cudamapper +} // namespace genomeworks +} // namespace claraparabricks \ No newline at end of file From 5860fb1eca04f35de5fb1fd66fc38424cdf38e96 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Thu, 22 Oct 2020 09:00:39 -0400 Subject: [PATCH 06/26] [cudamapper] Remove unnecessary includes. --- cudamapper/src/chainer_utils.cu | 10 ---------- cudamapper/src/chainer_utils.cuh | 8 -------- 2 files changed, 18 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 16f0f6d72..aabe59e6f 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -16,17 +16,7 @@ #include "chainer_utils.cuh" -#include -#include -#include - -// Needed for accumulate - remove when ported to cuda -#include -#include - #include -#include - #include namespace claraparabricks diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 5b5ca7366..b13b0c664 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -16,14 +16,6 @@ #pragma once -#include -#include -#include - -// Needed for accumulate - remove when ported to cuda -#include -#include - #include #include From ef4ce77f532580589ed71608e8207ba99c882e42 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Thu, 22 Oct 2020 09:49:21 -0400 Subject: [PATCH 07/26] [chainer_utils] Update docs. --- cudamapper/src/chainer_utils.cu | 2 +- cudamapper/src/chainer_utils.cuh | 82 +++++++++++++++++++++++++++++--- 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index aabe59e6f..d8d6438c1 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -41,7 +41,7 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc overlap.query_start_position_in_read_ = min(start.query_position_in_read_, end.query_position_in_read_); overlap.query_end_position_in_read_ = max(start.query_position_in_read_, end.query_position_in_read_); - bool is_negative_strand = end.target_position_in_read_ < start.target_position_in_read_; + const bool is_negative_strand = end.target_position_in_read_ < start.target_position_in_read_; if (is_negative_strand) { overlap.relative_strand = RelativeStrand::Reverse; diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index b13b0c664..1ac07973e 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -38,10 +38,33 @@ struct OverlapToNumResiduesOp } }; +/// +/// \brief Create an overlap from the first and last anchor in the chain and +/// the total number of anchors in the chain. +/// +/// \param start The first anchor in the chain. +/// \param end The last anchor in the chain. +/// \param num_anchors The total number of anchors in the chain. __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors); +/// +/// \brief Perform a backtrace on the predecessors array +/// to generate overlaps from anchors. Anchors must have been chained +/// by a chaining function that filles the predecessors and scores array. +/// +/// \param anchors An array of anchors. +/// \param overlaps An array of overlaps to be filled. +/// \param scores An array of scores. Only chains with a score greater than +/// min_score will be backtraced. +/// \param max_select_mask A boolean mask, used to mask out any subchains during +/// the backtrace. +/// \param predecessors An array of indices into the anchors array marking the predecessor +/// of each anchor within a chain. +/// \param n_anchors The number of anchors. +/// \param min_score The minimum score of a chain for performing backtracing. +/// \return __global__ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, Overlap* overlaps, double* scores, @@ -50,15 +73,31 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, const int32_t n_anchors, const int32_t min_score); /// -///@brief Allocate a 1-dimensional array representing an unrolled 2D-array +/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array /// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than -/// copy the anchors, the final array holds the indices of the global index array. +/// copy the anchors, the final array holds the indices within the anchors array +/// of the anchors in the chain. /// -///@param overlaps An array of Overlaps. Must have a well-formed num_residues_ field. -///@param unrolled_anchor_chains An array of int32_t. Will be resided on return. -///@param num_overlaps The number of overlaps in the overlaps array. +/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field. +/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. +/// param num_overlaps The number of overlaps in the overlaps array. ///@param _allocator The DefaultDeviceAllocator ///@param _cuda_stream The cudastream to allocate memory within. + +/// +/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array +/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than +/// copy the anchors, the final array holds the indices within the anchors array +/// of the anchors in the chain. +/// +/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field +/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. +/// \param anchor_chain_starts An array holding the index in the anchors array of the first +/// anchor in an overlap. +/// \param num_overlaps The number of overlaps in the overlaps array. +/// \param num_total_anchors The number of anchors in the anchors array. +/// \param _allocator The DefaultDeviceAllocator for this overlapper. +/// \param _cuda_stream The cudastream to allocate memory within. void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, @@ -67,6 +106,22 @@ void allocate_anchor_chains(device_buffer& overlaps, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream); +/// +/// \brief Calculate the anchors chains used to produce each overlap in the +/// overlap array for anchors chained by backtrace_anchors_to_overlaps. +/// +/// \param overlaps An array of overlaps. +/// \param anchors The array of anchors used to generate overlaps. +/// \param select_mask An array of bools, used to mask overlaps from output. +/// \param predecessors The predecessors array from anchor chaining. +/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold +/// the indices of anchors within each chain. +/// \param anchor_chain_starts An array which holds the indices of the first anchor for +/// each overlap in the overlaps array. +/// \param num_overlaps The number of overlaps in the overlaps array +/// \param check_mask A boolean. If true, only overlaps where select_mask is true +/// will have their anchor chains calculated. +/// \return __global__ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, const Anchor* anchors, const bool* select_mask, @@ -75,7 +130,22 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, int32_t* anchor_chain_starts, int32_t num_overlaps, bool check_mask); - +/// +/// \brief Calculate the anchors chains used to produce each overlap in the +/// overlap array for anchors chained by RLE. +/// +/// \param overlaps An array of overlaps. +/// \param anchors The array of anchors used to generate overlaps. +/// \param chain_starts An array which holds the indices of the first anchor for +/// each overlap in the overlaps array. +/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to +/// the chain_starts array. +/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold +/// the indices of anchors within each chain. +/// \param anchor_chain_starts An array which holds the indices of the first anchor for +/// each overlap in the overlaps array. +/// \param num_overlaps The number of overlaps in the overlaps array. +/// \return __global__ __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, const Anchor* anchors, const int32_t* chain_starts, From 2a864de10d3f982d669557735b27a8f19c534815 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Thu, 22 Oct 2020 09:50:46 -0400 Subject: [PATCH 08/26] [chainer_utils] Fix minor doc typos. --- cudamapper/src/chainer_utils.cuh | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 1ac07973e..80d65b7a4 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -72,17 +72,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, int32_t* predecessors, const int32_t n_anchors, const int32_t min_score); -/// -/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array -/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than -/// copy the anchors, the final array holds the indices within the anchors array -/// of the anchors in the chain. -/// -/// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field. -/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. -/// param num_overlaps The number of overlaps in the overlaps array. -///@param _allocator The DefaultDeviceAllocator -///@param _cuda_stream The cudastream to allocate memory within. /// /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array From 39f7232b73e30bc67a9cbd2a339b1964a2f6966c Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 26 Oct 2020 20:57:57 -0400 Subject: [PATCH 09/26] [docs] Fix typos and doxygen docs in chainer_utils. --- cudamapper/src/chainer_utils.cu | 8 +++ cudamapper/src/chainer_utils.cuh | 50 +++++-------------- .../tests/Test_CudamapperChainerUtils.cu | 10 ---- 3 files changed, 20 insertions(+), 48 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index d8d6438c1..e5b68258e 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -30,6 +30,14 @@ namespace cudamapper namespace chainerutils { +struct OverlapToNumResiduesOp +{ + __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const + { + return overlap.num_residues_; + } +}; + __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) { Overlap overlap; diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 80d65b7a4..df3ae09b1 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -30,15 +30,6 @@ namespace cudamapper namespace chainerutils { -struct OverlapToNumResiduesOp -{ - __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const - { - return overlap.num_residues_; - } -}; - -/// /// \brief Create an overlap from the first and last anchor in the chain and /// the total number of anchors in the chain. /// @@ -49,22 +40,17 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors); -/// /// \brief Perform a backtrace on the predecessors array /// to generate overlaps from anchors. Anchors must have been chained /// by a chaining function that filles the predecessors and scores array. /// /// \param anchors An array of anchors. /// \param overlaps An array of overlaps to be filled. -/// \param scores An array of scores. Only chains with a score greater than -/// min_score will be backtraced. -/// \param max_select_mask A boolean mask, used to mask out any subchains during -/// the backtrace. -/// \param predecessors An array of indices into the anchors array marking the predecessor -/// of each anchor within a chain. +/// \param scores An array of scores. Only chains with a score greater than min_score will be backtraced. +/// \param max_select_mask A boolean mask, used to mask out any subchains during the backtrace. +/// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain. /// \param n_anchors The number of anchors. /// \param min_score The minimum score of a chain for performing backtracing. -/// \return __global__ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, Overlap* overlaps, double* scores, @@ -73,7 +59,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, const int32_t n_anchors, const int32_t min_score); -/// /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array /// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than /// copy the anchors, the final array holds the indices within the anchors array @@ -81,8 +66,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, /// /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field /// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. -/// \param anchor_chain_starts An array holding the index in the anchors array of the first -/// anchor in an overlap. +/// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap. /// \param num_overlaps The number of overlaps in the overlaps array. /// \param num_total_anchors The number of anchors in the anchors array. /// \param _allocator The DefaultDeviceAllocator for this overlapper. @@ -95,7 +79,6 @@ void allocate_anchor_chains(device_buffer& overlaps, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream); -/// /// \brief Calculate the anchors chains used to produce each overlap in the /// overlap array for anchors chained by backtrace_anchors_to_overlaps. /// @@ -103,14 +86,10 @@ void allocate_anchor_chains(device_buffer& overlaps, /// \param anchors The array of anchors used to generate overlaps. /// \param select_mask An array of bools, used to mask overlaps from output. /// \param predecessors The predecessors array from anchor chaining. -/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold -/// the indices of anchors within each chain. -/// \param anchor_chain_starts An array which holds the indices of the first anchor for -/// each overlap in the overlaps array. +/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain. +/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array -/// \param check_mask A boolean. If true, only overlaps where select_mask is true -/// will have their anchor chains calculated. -/// \return __global__ +/// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated. __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, const Anchor* anchors, const bool* select_mask, @@ -119,22 +98,17 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, int32_t* anchor_chain_starts, int32_t num_overlaps, bool check_mask); -/// + /// \brief Calculate the anchors chains used to produce each overlap in the /// overlap array for anchors chained by RLE. /// /// \param overlaps An array of overlaps. /// \param anchors The array of anchors used to generate overlaps. -/// \param chain_starts An array which holds the indices of the first anchor for -/// each overlap in the overlaps array. -/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to -/// the chain_starts array. -/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold -/// the indices of anchors within each chain. -/// \param anchor_chain_starts An array which holds the indices of the first anchor for -/// each overlap in the overlaps array. +/// \param chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. +/// \param chain_lengths An array which holds the length of each run of anchors, corresponding to the chain_starts array. +/// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain. +/// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array. -/// \return __global__ __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, const Anchor* anchors, const int32_t* chain_starts, diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 6348c1602..666f3f459 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -111,9 +111,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data()); cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data()); - // device_buffer unrolled_anchor_chains(1, allocator, cuda_stream.get()); - // device_buffer chain_starts(1, allocator, cuda_stream.get()); - device_buffer unrolled_anchor_chains; device_buffer chain_starts; @@ -125,13 +122,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) num_total_anchors, allocator, cu_ptr); - // void allocate_anchor_chains(device_buffer overlaps, - // device_buffer& unrolled_anchor_chains, - // device_buffer& anchor_chain_starts, - // const int32_t num_overlaps, - // int32_t& num_total_anchors, - // DefaultDeviceAllocator& _allocator, - // cudaStream_t& _cuda_stream); } } // namespace cudamapper From 8d7d1efc80b35a6fc9b0fb84c34474ddd1229c94 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 26 Oct 2020 21:12:25 -0400 Subject: [PATCH 10/26] Fix compile errors from missing imports --- cudaaligner/samples/sample_cudaaligner.cpp | 1 + cudaextender/samples/sample_cudaextender.cpp | 1 + cudamapper/samples/sample_cudamapper.cpp | 1 + cudamapper/src/chainer_utils.cu | 2 +- 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cudaaligner/samples/sample_cudaaligner.cpp b/cudaaligner/samples/sample_cudaaligner.cpp index db147faab..f236ae2b3 100644 --- a/cudaaligner/samples/sample_cudaaligner.cpp +++ b/cudaaligner/samples/sample_cudaaligner.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/cudaextender/samples/sample_cudaextender.cpp b/cudaextender/samples/sample_cudaextender.cpp index 87ad85793..53d4bd8e4 100644 --- a/cudaextender/samples/sample_cudaextender.cpp +++ b/cudaextender/samples/sample_cudaextender.cpp @@ -26,6 +26,7 @@ #include #include #include +#include using namespace claraparabricks::genomeworks; using namespace cudautils; diff --git a/cudamapper/samples/sample_cudamapper.cpp b/cudamapper/samples/sample_cudamapper.cpp index cfe2d59f9..65151d16d 100644 --- a/cudamapper/samples/sample_cudamapper.cpp +++ b/cudamapper/samples/sample_cudamapper.cpp @@ -29,6 +29,7 @@ #include #include #include +#include // define constants. See cudamapper/src/application_parameters.hpp for more. // constants used in multiple places diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index e5b68258e..7066b3aa1 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -127,7 +127,7 @@ void allocate_anchor_chains(device_buffer& overlaps, __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, const Anchor* anchors, - const int32_t* chain_starts, + const int32_t* const chain_starts, const int32_t* chain_lengths, int32_t* anchor_chains, int32_t* anchor_chain_starts, From 756739f749ad8e93788d50377bd09a88d7494826 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 26 Oct 2020 21:28:19 -0400 Subject: [PATCH 11/26] [chainer_utils] Make arguments const / const * const where possible. --- cudamapper/src/chainer_utils.cu | 46 ++++++++++++++++---------------- cudamapper/src/chainer_utils.cuh | 42 ++++++++++++++--------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 7066b3aa1..3280f6501 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -68,7 +68,7 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - int32_t num_overlaps, + const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream) @@ -125,16 +125,16 @@ void allocate_anchor_chains(device_buffer& overlaps, _cuda_stream); } -__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, - const Anchor* anchors, +__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, + const Anchor* const anchors, const int32_t* const chain_starts, - const int32_t* chain_lengths, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps) + const int32_t* const chain_lengths, + int32_t* const anchor_chains, + int32_t* const anchor_chain_starts, + const int32_t num_overlaps) { - int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int32_t stride = blockDim.x * gridDim.x; + const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; for (int i = d_thread_id; i < num_overlaps; i += stride) { int32_t chain_start = chain_starts[i]; @@ -146,17 +146,17 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, } } -__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, - const Anchor* anchors, - const bool* select_mask, - const int32_t* predecessors, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps, - bool check_mask) +__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, + const Anchor* const anchors, + const bool* const select_mask, + const int32_t* const predecessors, + int32_t* const anchor_chains, + int32_t* const anchor_chain_starts, + const int32_t num_overlaps, + const bool check_mask) { - int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int32_t stride = blockDim.x * gridDim.x; + const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; // Processes one overlap per iteration, // "i" corresponds to an overlap @@ -181,11 +181,11 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, } } -__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, +__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, Overlap* overlaps, - double* scores, - bool* max_select_mask, - int32_t* predecessors, + double* const scores, + bool* const max_select_mask, + int32_t* const predecessors, const int32_t n_anchors, const int32_t min_score) { diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index df3ae09b1..12f75b6de 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -51,11 +51,11 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain. /// \param n_anchors The number of anchors. /// \param min_score The minimum score of a chain for performing backtracing. -__global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, - Overlap* overlaps, - double* scores, - bool* max_select_mask, - int32_t* predecessors, +__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, + Overlap* const overlaps, + double* const scores, + bool* const max_select_mask, + int32_t* const predecessors, const int32_t n_anchors, const int32_t min_score); @@ -74,7 +74,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* anchors, void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - int32_t num_overlaps, + const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator& _allocator, cudaStream_t& _cuda_stream); @@ -90,14 +90,14 @@ void allocate_anchor_chains(device_buffer& overlaps, /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array /// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated. -__global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, - const Anchor* anchors, - const bool* select_mask, - const int32_t* predecessors, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps, - bool check_mask); +__global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, + const Anchor* const anchors, + const bool* const select_mask, + const int32_t* const predecessors, + int32_t* const anchor_chains, + int32_t* const anchor_chain_starts, + const int32_t num_overlaps, + const bool check_mask); /// \brief Calculate the anchors chains used to produce each overlap in the /// overlap array for anchors chained by RLE. @@ -109,13 +109,13 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* overlaps, /// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain. /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array. -__global__ void output_overlap_chains_by_RLE(const Overlap* overlaps, - const Anchor* anchors, - const int32_t* chain_starts, - const int32_t* chain_lengths, - int32_t* anchor_chains, - int32_t* anchor_chain_starts, - int32_t num_overlaps); +__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, + const Anchor* const anchors, + const int32_t* const chain_starts, + const int32_t* const chain_lengths, + int32_t* const anchor_chains, + int32_t* const anchor_chain_starts, + const int32_t num_overlaps); } // namespace chainerutils } // namespace cudamapper From 7680371b0f4708d0b43ab05f2e028ed006f14083 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 26 Oct 2020 21:52:49 -0400 Subject: [PATCH 12/26] [chainer_utils] Pass cudastream and allocation by value (not reference) and fix docs issues brought up in review. --- cudamapper/src/chainer_utils.cu | 18 +++++++++--------- cudamapper/src/chainer_utils.cuh | 17 +++++++++-------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 3280f6501..208538f0f 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -70,24 +70,24 @@ void allocate_anchor_chains(device_buffer& overlaps, device_buffer& anchor_chain_starts, const int32_t num_overlaps, int32_t& num_total_anchors, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream) + DefaultDeviceAllocator allocator, + cudaStream_t cuda_stream) { // sum the number of chains across all overlaps - device_buffer d_temp_buf(_allocator, _cuda_stream); + device_buffer d_temp_buf(allocator, cuda_stream); void* d_temp_storage = nullptr; std::size_t temp_storage_bytes = 0; OverlapToNumResiduesOp overlap_residue_count_op; cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); - device_buffer d_num_total_anchors(1, _allocator, _cuda_stream); + device_buffer d_num_total_anchors(1, allocator, cuda_stream); cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_residue_counts, d_num_total_anchors.data(), num_overlaps, - _cuda_stream); + cuda_stream); d_temp_buf.clear_and_resize(temp_storage_bytes); d_temp_storage = d_temp_buf.data(); @@ -97,12 +97,12 @@ void allocate_anchor_chains(device_buffer& overlaps, d_residue_counts, d_num_total_anchors.data(), num_overlaps, - _cuda_stream); + cuda_stream); d_temp_storage = nullptr; temp_storage_bytes = 0; - num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), _cuda_stream); + num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream); unrolled_anchor_chains.clear_and_resize(num_total_anchors); anchor_chain_starts.clear_and_resize(num_overlaps); @@ -112,7 +112,7 @@ void allocate_anchor_chains(device_buffer& overlaps, d_residue_counts, anchor_chain_starts.data(), num_overlaps, - _cuda_stream); + cuda_stream); d_temp_buf.clear_and_resize(temp_storage_bytes); d_temp_storage = d_temp_buf.data(); @@ -122,7 +122,7 @@ void allocate_anchor_chains(device_buffer& overlaps, d_residue_counts, anchor_chain_starts.data(), num_overlaps, - _cuda_stream); + cuda_stream); } __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 12f75b6de..c9c331c2b 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -40,9 +40,10 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors); -/// \brief Perform a backtrace on the predecessors array -/// to generate overlaps from anchors. Anchors must have been chained -/// by a chaining function that filles the predecessors and scores array. +/// \brief Produce an array of overlaps by iterating +/// through the predecessors of each anchor within a chain, +/// until an anchor with no predecessor is reached. Anchors must have been chained +/// by a chaining function that fills the predecessors and scores array. /// /// \param anchors An array of anchors. /// \param overlaps An array of overlaps to be filled. @@ -65,19 +66,19 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, /// of the anchors in the chain. /// /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field -/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. +/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. /// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap. /// \param num_overlaps The number of overlaps in the overlaps array. /// \param num_total_anchors The number of anchors in the anchors array. -/// \param _allocator The DefaultDeviceAllocator for this overlapper. -/// \param _cuda_stream The cudastream to allocate memory within. +/// \param allocator The DefaultDeviceAllocator for this overlapper. +/// \param cuda_stream The cudastream to allocate memory within. void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, const int32_t num_overlaps, int32_t& num_total_anchors, - DefaultDeviceAllocator& _allocator, - cudaStream_t& _cuda_stream); + DefaultDeviceAllocator allocator, + cudaStream_t cuda_stream); /// \brief Calculate the anchors chains used to produce each overlap in the /// overlap array for anchors chained by backtrace_anchors_to_overlaps. From 78da54195e1d117af4d7399b7fd222304a6b9316 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Tue, 27 Oct 2020 09:48:37 -0400 Subject: [PATCH 13/26] [chainer_utils] Address review comments. --- cudamapper/src/chainer_utils.cu | 123 +++++++++--------- cudamapper/src/chainer_utils.cuh | 1 - .../tests/Test_CudamapperChainerUtils.cu | 1 - 3 files changed, 61 insertions(+), 64 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 208538f0f..b3e39af13 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -65,10 +65,51 @@ __host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anc return overlap; } +__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, + Overlap* overlaps, + double* const scores, + bool* const max_select_mask, + int32_t* const predecessors, + const int32_t n_anchors, + const int32_t min_score) +{ + const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; + if (d_tid < n_anchors) + { + + int32_t global_overlap_index = d_tid; + if (scores[d_tid] >= min_score) + { + + int32_t index = global_overlap_index; + int32_t first_index = index; + int32_t num_anchors_in_chain = 0; + Anchor final_anchor = anchors[global_overlap_index]; + + while (index != -1) + { + first_index = index; + int32_t pred = predecessors[index]; + if (pred != -1) + { + max_select_mask[pred] = false; + } + num_anchors_in_chain++; + index = predecessors[index]; + } + Anchor first_anchor = anchors[first_index]; + overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); + } + else + { + max_select_mask[global_overlap_index] = false; + } + } +} + void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream) @@ -86,7 +127,7 @@ void allocate_anchor_chains(device_buffer& overlaps, temp_storage_bytes, d_residue_counts, d_num_total_anchors.data(), - num_overlaps, + overlaps.size(), cuda_stream); d_temp_buf.clear_and_resize(temp_storage_bytes); @@ -96,7 +137,7 @@ void allocate_anchor_chains(device_buffer& overlaps, temp_storage_bytes, d_residue_counts, d_num_total_anchors.data(), - num_overlaps, + overlaps.size(), cuda_stream); d_temp_storage = nullptr; @@ -105,13 +146,13 @@ void allocate_anchor_chains(device_buffer& overlaps, num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream); unrolled_anchor_chains.clear_and_resize(num_total_anchors); - anchor_chain_starts.clear_and_resize(num_overlaps); + anchor_chain_starts.clear_and_resize(overlaps.size()); cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_residue_counts, anchor_chain_starts.data(), - num_overlaps, + overlaps.size(), cuda_stream); d_temp_buf.clear_and_resize(temp_storage_bytes); @@ -121,31 +162,10 @@ void allocate_anchor_chains(device_buffer& overlaps, temp_storage_bytes, d_residue_counts, anchor_chain_starts.data(), - num_overlaps, + overlaps.size(), cuda_stream); } -__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, - const Anchor* const anchors, - const int32_t* const chain_starts, - const int32_t* const chain_lengths, - int32_t* const anchor_chains, - int32_t* const anchor_chain_starts, - const int32_t num_overlaps) -{ - const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; - for (int i = d_thread_id; i < num_overlaps; i += stride) - { - int32_t chain_start = chain_starts[i]; - int32_t chain_length = chain_lengths[i]; - for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i) - { - anchor_chains[ind] = ind; - } - } -} - __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, const Anchor* const anchors, const bool* const select_mask, @@ -181,44 +201,23 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps } } -__global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, - Overlap* overlaps, - double* const scores, - bool* const max_select_mask, - int32_t* const predecessors, - const int32_t n_anchors, - const int32_t min_score) +__global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, + const Anchor* const anchors, + const int32_t* const chain_starts, + const int32_t* const chain_lengths, + int32_t* const anchor_chains, + int32_t* const anchor_chain_starts, + const int32_t num_overlaps) { - const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; - if (d_tid < n_anchors) + const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + for (int i = d_thread_id; i < num_overlaps; i += stride) { - - int32_t global_overlap_index = d_tid; - if (scores[d_tid] >= min_score) - { - - int32_t index = global_overlap_index; - int32_t first_index = index; - int32_t num_anchors_in_chain = 0; - Anchor final_anchor = anchors[global_overlap_index]; - - while (index != -1) - { - first_index = index; - int32_t pred = predecessors[index]; - if (pred != -1) - { - max_select_mask[pred] = false; - } - num_anchors_in_chain++; - index = predecessors[index]; - } - Anchor first_anchor = anchors[first_index]; - overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); - } - else + int32_t chain_start = chain_starts[i]; + int32_t chain_length = chain_lengths[i]; + for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i) { - max_select_mask[global_overlap_index] = false; + anchor_chains[ind] = ind; } } } diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index c9c331c2b..8735a7bbe 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -75,7 +75,6 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, void allocate_anchor_chains(device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - const int32_t num_overlaps, int32_t& num_total_anchors, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream); diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 666f3f459..9f1da5914 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -118,7 +118,6 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) chainerutils::allocate_anchor_chains(d_overlaps, unrolled_anchor_chains, chain_starts, - overlaps.size(), num_total_anchors, allocator, cu_ptr); From 44f2f4dbecbbaeffe145c0c455fcc0ac2dcb4fad Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Tue, 27 Oct 2020 09:58:28 -0400 Subject: [PATCH 14/26] [chainer_utils] Refactor backtrace_anchors_to_overlaps to be a grid-stride function. --- cudamapper/src/chainer_utils.cu | 28 +++++++++++++--------------- cudamapper/src/chainer_utils.cuh | 6 +++--- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index b3e39af13..c2d23d58d 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -73,18 +73,16 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, const int32_t n_anchors, const int32_t min_score) { - const std::size_t d_tid = blockIdx.x * blockDim.x + threadIdx.x; - if (d_tid < n_anchors) + const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + for (int i = tid; i < n_anchors; i += stride) { - - int32_t global_overlap_index = d_tid; - if (scores[d_tid] >= min_score) + if (scores[i] >= min_score) { - - int32_t index = global_overlap_index; + int32_t index = i; int32_t first_index = index; int32_t num_anchors_in_chain = 0; - Anchor final_anchor = anchors[global_overlap_index]; + Anchor final_anchor = anchors[i]; while (index != -1) { @@ -97,17 +95,17 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, num_anchors_in_chain++; index = predecessors[index]; } - Anchor first_anchor = anchors[first_index]; - overlaps[global_overlap_index] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); + Anchor first_anchor = anchors[first_index]; + overlaps[i] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); } else { - max_select_mask[global_overlap_index] = false; + max_select_mask[i] = false; } } } -void allocate_anchor_chains(device_buffer& overlaps, +void allocate_anchor_chains(const device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, int32_t& num_total_anchors, @@ -119,7 +117,7 @@ void allocate_anchor_chains(device_buffer& overlaps, void* d_temp_storage = nullptr; std::size_t temp_storage_bytes = 0; OverlapToNumResiduesOp overlap_residue_count_op; - cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); + cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); device_buffer d_num_total_anchors(1, allocator, cuda_stream); @@ -207,11 +205,11 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, const int32_t* const chain_lengths, int32_t* const anchor_chains, int32_t* const anchor_chain_starts, - const int32_t num_overlaps) + const uint32_t num_overlaps) { const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; const int32_t stride = blockDim.x * gridDim.x; - for (int i = d_thread_id; i < num_overlaps; i += stride) + for (uint32_t i = d_thread_id; i < num_overlaps; i += stride) { int32_t chain_start = chain_starts[i]; int32_t chain_length = chain_lengths[i]; diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 8735a7bbe..12ad64fe2 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -72,12 +72,12 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, /// \param num_total_anchors The number of anchors in the anchors array. /// \param allocator The DefaultDeviceAllocator for this overlapper. /// \param cuda_stream The cudastream to allocate memory within. -void allocate_anchor_chains(device_buffer& overlaps, +void allocate_anchor_chains(const device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, int32_t& num_total_anchors, DefaultDeviceAllocator allocator, - cudaStream_t cuda_stream); + cudaStream_t cuda_stream = 0); /// \brief Calculate the anchors chains used to produce each overlap in the /// overlap array for anchors chained by backtrace_anchors_to_overlaps. @@ -115,7 +115,7 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, const int32_t* const chain_lengths, int32_t* const anchor_chains, int32_t* const anchor_chain_starts, - const int32_t num_overlaps); + const uint32_t num_overlaps); } // namespace chainerutils } // namespace cudamapper From be3b0a4483bc7857ca5ffccbc19a3bc5737f1d2f Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Tue, 27 Oct 2020 23:34:26 -0400 Subject: [PATCH 15/26] [chainer_utils] Fix bugs in tests caused by not specifying cudastream when instantiating device arrays. --- cudamapper/tests/Test_CudamapperChainerUtils.cu | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 9f1da5914..560e94432 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -69,8 +69,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests) TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) { - - DefaultDeviceAllocator allocator = create_default_device_allocator(2048); + DefaultDeviceAllocator allocator = create_default_device_allocator(); CudaStream cuda_stream = make_cuda_stream(); auto cu_ptr = cuda_stream.get(); @@ -106,15 +105,15 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) overlaps.push_back(p_ab); overlaps.push_back(p_bc); - device_buffer d_anchors(anchors.size(), allocator, cuda_stream.get()); - device_buffer d_overlaps(overlaps.size(), allocator, cuda_stream.get()); - cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data()); - cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data()); - - device_buffer unrolled_anchor_chains; - device_buffer chain_starts; + device_buffer d_anchors(anchors.size(), allocator, cu_ptr); + device_buffer d_overlaps(overlaps.size(), allocator, cu_ptr); + cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data(), cu_ptr); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data(), cu_ptr); + device_buffer unrolled_anchor_chains(0, allocator, cu_ptr); + device_buffer chain_starts(0, allocator, cu_ptr); int32_t num_total_anchors; + chainerutils::allocate_anchor_chains(d_overlaps, unrolled_anchor_chains, chain_starts, From 9f75411a491f79a6a5dda56eb4cf68a95910042b Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Wed, 28 Oct 2020 10:26:12 -0400 Subject: [PATCH 16/26] [chainer_utils] Refactor device memory from d_* convention to *_d. --- cudamapper/src/chainer_utils.cu | 52 +++++++++---------- .../tests/Test_CudamapperChainerUtils.cu | 10 ++-- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index c2d23d58d..fa956a861 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -113,52 +113,52 @@ void allocate_anchor_chains(const device_buffer& overlaps, cudaStream_t cuda_stream) { // sum the number of chains across all overlaps - device_buffer d_temp_buf(allocator, cuda_stream); - void* d_temp_storage = nullptr; + device_buffer temp_buf_d(allocator, cuda_stream); + void* temp_storage_d = nullptr; std::size_t temp_storage_bytes = 0; OverlapToNumResiduesOp overlap_residue_count_op; - cub::TransformInputIterator d_residue_counts(overlaps.data(), overlap_residue_count_op); + cub::TransformInputIterator residue_counts_d(overlaps.data(), overlap_residue_count_op); - device_buffer d_num_total_anchors(1, allocator, cuda_stream); + device_buffer num_total_anchors_d(1, allocator, cuda_stream); - cub::DeviceReduce::Sum(d_temp_storage, + cub::DeviceReduce::Sum(temp_storage_d, temp_storage_bytes, - d_residue_counts, - d_num_total_anchors.data(), + residue_counts_d, + num_total_anchors_d.data(), overlaps.size(), cuda_stream); - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); + temp_buf_d.clear_and_resize(temp_storage_bytes); + temp_storage_d = temp_buf_d.data(); - cub::DeviceReduce::Sum(d_temp_storage, + cub::DeviceReduce::Sum(temp_storage_d, temp_storage_bytes, - d_residue_counts, - d_num_total_anchors.data(), + residue_counts_d, + num_total_anchors_d.data(), overlaps.size(), cuda_stream); - d_temp_storage = nullptr; + temp_storage_d = nullptr; temp_storage_bytes = 0; - num_total_anchors = cudautils::get_value_from_device(d_num_total_anchors.data(), cuda_stream); + num_total_anchors = cudautils::get_value_from_device(num_total_anchors_d.data(), cuda_stream); unrolled_anchor_chains.clear_and_resize(num_total_anchors); anchor_chain_starts.clear_and_resize(overlaps.size()); - cub::DeviceScan::ExclusiveSum(d_temp_storage, + cub::DeviceScan::ExclusiveSum(temp_storage_d, temp_storage_bytes, - d_residue_counts, + residue_counts_d, anchor_chain_starts.data(), overlaps.size(), cuda_stream); - d_temp_buf.clear_and_resize(temp_storage_bytes); - d_temp_storage = d_temp_buf.data(); + temp_buf_d.clear_and_resize(temp_storage_bytes); + temp_storage_d = temp_buf_d.data(); - cub::DeviceScan::ExclusiveSum(d_temp_storage, + cub::DeviceScan::ExclusiveSum(temp_storage_d, temp_storage_bytes, - d_residue_counts, + residue_counts_d, anchor_chain_starts.data(), overlaps.size(), cuda_stream); @@ -173,12 +173,12 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps const int32_t num_overlaps, const bool check_mask) { - const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; // Processes one overlap per iteration, // "i" corresponds to an overlap - for (int i = d_thread_id; i < num_overlaps; i += stride) + for (int i = thread_id; i < num_overlaps; i += stride) { // index within this chain of anchors (i.e., the anchors within a single overlap) @@ -207,9 +207,9 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, int32_t* const anchor_chain_starts, const uint32_t num_overlaps) { - const int32_t d_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; - for (uint32_t i = d_thread_id; i < num_overlaps; i += stride) + const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + for (uint32_t i = thread_id; i < num_overlaps; i += stride) { int32_t chain_start = chain_starts[i]; int32_t chain_length = chain_lengths[i]; diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 560e94432..609bdd79a 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -105,16 +105,16 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) overlaps.push_back(p_ab); overlaps.push_back(p_bc); - device_buffer d_anchors(anchors.size(), allocator, cu_ptr); - device_buffer d_overlaps(overlaps.size(), allocator, cu_ptr); - cudautils::device_copy_n(anchors.data(), anchors.size(), d_anchors.data(), cu_ptr); - cudautils::device_copy_n(overlaps.data(), overlaps.size(), d_overlaps.data(), cu_ptr); + device_buffer anchors_d(anchors.size(), allocator, cu_ptr); + device_buffer overlaps_d(overlaps.size(), allocator, cu_ptr); + cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), cu_ptr); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), cu_ptr); device_buffer unrolled_anchor_chains(0, allocator, cu_ptr); device_buffer chain_starts(0, allocator, cu_ptr); int32_t num_total_anchors; - chainerutils::allocate_anchor_chains(d_overlaps, + chainerutils::allocate_anchor_chains(overlaps_d, unrolled_anchor_chains, chain_starts, num_total_anchors, From c72e5c966bae243de66f04e1cc4690d41e09963e Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 10:26:57 -0500 Subject: [PATCH 17/26] [chainerutils] Rename create_simple_overlap to create_overlap --- cudamapper/src/chainer_utils.cu | 4 ++-- cudamapper/src/chainer_utils.cuh | 2 +- cudamapper/tests/Test_CudamapperChainerUtils.cu | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index fa956a861..89875591d 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -38,7 +38,7 @@ struct OverlapToNumResiduesOp } }; -__host__ __device__ Overlap create_simple_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) +__host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) { Overlap overlap; overlap.num_residues_ = num_anchors; @@ -96,7 +96,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, index = predecessors[index]; } Anchor first_anchor = anchors[first_index]; - overlaps[i] = create_simple_overlap(first_anchor, final_anchor, num_anchors_in_chain); + overlaps[i] = create_overlap(first_anchor, final_anchor, num_anchors_in_chain); } else { diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 12ad64fe2..bf8752da3 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -36,7 +36,7 @@ namespace chainerutils /// \param start The first anchor in the chain. /// \param end The last anchor in the chain. /// \param num_anchors The total number of anchors in the chain. -__host__ __device__ Overlap create_simple_overlap(const Anchor& start, +__host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors); diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 609bdd79a..3638dd92b 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -32,7 +32,7 @@ namespace genomeworks namespace cudamapper { -TEST(TestChainerUtils, Create_Simple_Overlap_Tests) +TEST(TestChainerUtils, Create_Overlap_Tests) { Anchor a; a.query_read_id_ = 12; @@ -46,7 +46,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests) b.target_read_id_ = 46; b.target_position_in_read_ = 20400; - Overlap p_ab = chainerutils::create_simple_overlap(a, b, 12); + Overlap p_ab = chainerutils::create_overlap(a, b, 12); ASSERT_EQ(p_ab.query_read_id_, static_cast(12)); ASSERT_EQ(p_ab.target_read_id_, 46); ASSERT_EQ(p_ab.query_start_position_in_read_, 130); @@ -61,7 +61,7 @@ TEST(TestChainerUtils, Create_Simple_Overlap_Tests) c.target_read_id_ = 46; c.target_position_in_read_ = 16000; - Overlap p_bc = chainerutils::create_simple_overlap(b, c, 22); + Overlap p_bc = chainerutils::create_overlap(b, c, 22); ASSERT_EQ(p_bc.target_start_position_in_read_, 16000); ASSERT_EQ(p_bc.target_end_position_in_read_, 20400); ASSERT_EQ(static_cast(p_bc.relative_strand), static_cast(RelativeStrand::Reverse)); @@ -86,7 +86,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) b.target_read_id_ = 46; b.target_position_in_read_ = 20400; - Overlap p_ab = chainerutils::create_simple_overlap(a, b, 2); + Overlap p_ab = chainerutils::create_overlap(a, b, 2); Anchor c; c.query_read_id_ = 12; @@ -94,7 +94,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) c.target_read_id_ = 46; c.target_position_in_read_ = 16000; - Overlap p_bc = chainerutils::create_simple_overlap(b, c, 2); + Overlap p_bc = chainerutils::create_overlap(b, c, 2); std::vector anchors; anchors.push_back(a); From 6df3b764931864fc0a979202137471118f26d5b6 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 10:32:00 -0500 Subject: [PATCH 18/26] [chainerutils] Move declaration of temporary memory used by CUB. --- cudamapper/src/chainer_utils.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 89875591d..0cdb13ad7 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -113,11 +113,12 @@ void allocate_anchor_chains(const device_buffer& overlaps, cudaStream_t cuda_stream) { // sum the number of chains across all overlaps - device_buffer temp_buf_d(allocator, cuda_stream); + void* temp_storage_d = nullptr; std::size_t temp_storage_bytes = 0; OverlapToNumResiduesOp overlap_residue_count_op; - cub::TransformInputIterator residue_counts_d(overlaps.data(), overlap_residue_count_op); + cub::TransformInputIterator residue_counts_d(overlaps.data(), + overlap_residue_count_op); device_buffer num_total_anchors_d(1, allocator, cuda_stream); @@ -128,7 +129,7 @@ void allocate_anchor_chains(const device_buffer& overlaps, overlaps.size(), cuda_stream); - temp_buf_d.clear_and_resize(temp_storage_bytes); + device_buffer temp_buf_d(temp_storage_bytes, allocator, cuda_stream); temp_storage_d = temp_buf_d.data(); cub::DeviceReduce::Sum(temp_storage_d, From 781c2fd201ed96a4027a929fb6ed3b73d95cb3ce Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 10:52:44 -0500 Subject: [PATCH 19/26] [chainerutils] Use 64bit ints for the number of anchors. --- cudamapper/src/chainer_utils.cu | 6 +++--- cudamapper/src/chainer_utils.cuh | 8 ++++---- cudamapper/tests/Test_CudamapperChainerUtils.cu | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 0cdb13ad7..bb7231877 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -70,7 +70,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, double* const scores, bool* const max_select_mask, int32_t* const predecessors, - const int32_t n_anchors, + const int64_t n_anchors, const int32_t min_score) { const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -108,7 +108,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, void allocate_anchor_chains(const device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - int32_t& num_total_anchors, + int64_t& num_total_anchors, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream) { @@ -120,7 +120,7 @@ void allocate_anchor_chains(const device_buffer& overlaps, cub::TransformInputIterator residue_counts_d(overlaps.data(), overlap_residue_count_op); - device_buffer num_total_anchors_d(1, allocator, cuda_stream); + device_buffer num_total_anchors_d(1, allocator, cuda_stream); cub::DeviceReduce::Sum(temp_storage_d, temp_storage_bytes, diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index bf8752da3..14af4b21a 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -37,8 +37,8 @@ namespace chainerutils /// \param end The last anchor in the chain. /// \param num_anchors The total number of anchors in the chain. __host__ __device__ Overlap create_overlap(const Anchor& start, - const Anchor& end, - const int32_t num_anchors); + const Anchor& end, + const int32_t num_anchors); /// \brief Produce an array of overlaps by iterating /// through the predecessors of each anchor within a chain, @@ -57,7 +57,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, double* const scores, bool* const max_select_mask, int32_t* const predecessors, - const int32_t n_anchors, + const int64_t n_anchors, const int32_t min_score); /// \brief Allocate a 1-dimensional array representing an unrolled 2D-array @@ -75,7 +75,7 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, void allocate_anchor_chains(const device_buffer& overlaps, device_buffer& unrolled_anchor_chains, device_buffer& anchor_chain_starts, - int32_t& num_total_anchors, + int64_t& num_total_anchors, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream = 0); diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 3638dd92b..af95802dc 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -112,7 +112,7 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) device_buffer unrolled_anchor_chains(0, allocator, cu_ptr); device_buffer chain_starts(0, allocator, cu_ptr); - int32_t num_total_anchors; + int64_t num_total_anchors; chainerutils::allocate_anchor_chains(overlaps_d, unrolled_anchor_chains, From bd07138da853b3aa9b5f84c94093f4401e38b38a Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 12:07:02 -0500 Subject: [PATCH 20/26] [chainerutils] Attempt to clean up documentation. --- cudamapper/src/chainer_utils.cu | 5 +++-- cudamapper/src/chainer_utils.cuh | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index bb7231877..d6bd962c9 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -73,8 +73,9 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, const int64_t n_anchors, const int32_t min_score) { - const std::size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; + for (int i = tid; i < n_anchors; i += stride) { if (scores[i] >= min_score) diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 14af4b21a..f7488eb78 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -40,10 +40,8 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors); -/// \brief Produce an array of overlaps by iterating -/// through the predecessors of each anchor within a chain, -/// until an anchor with no predecessor is reached. Anchors must have been chained -/// by a chaining function that fills the predecessors and scores array. +/// \brief Given an array of anchors and an array denoting the immediate +/// predecessor of each anchor, transform chains of anchors into overlaps. /// /// \param anchors An array of anchors. /// \param overlaps An array of overlaps to be filled. @@ -60,13 +58,12 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, const int64_t n_anchors, const int32_t min_score); -/// \brief Allocate a 1-dimensional array representing an unrolled 2D-array -/// (overlap X n_anchors_in_overlap) of anchors within each overlap. Rather than -/// copy the anchors, the final array holds the indices within the anchors array +/// \brief Allocate a 1-dimensional array representing an unrolled ragged array +/// of anchors within each overlap. The final array holds the indices within the anchors array /// of the anchors in the chain. /// /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field -/// \param unrolled_anchor_chains An array of int32_t. Will be resided on return. +/// \param unrolled_anchor_chains An array of int32_t. Will be resized on return. /// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap. /// \param num_overlaps The number of overlaps in the overlaps array. /// \param num_total_anchors The number of anchors in the anchors array. @@ -79,8 +76,10 @@ void allocate_anchor_chains(const device_buffer& overlaps, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream = 0); -/// \brief Calculate the anchors chains used to produce each overlap in the -/// overlap array for anchors chained by backtrace_anchors_to_overlaps. +/// \brief Given an array of overlaps, fill a 1D unrolled ragged array +/// containing the anchors used to generate each overlap. Anchors must have +/// been chained with a chaining function that fills the predecessors array +/// with the immediate predecessor of each anchor. /// /// \param overlaps An array of overlaps. /// \param anchors The array of anchors used to generate overlaps. @@ -99,8 +98,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps const int32_t num_overlaps, const bool check_mask); -/// \brief Calculate the anchors chains used to produce each overlap in the -/// overlap array for anchors chained by RLE. +/// \brief Fill a 1D unrolled ragged array with the anchors used to produce each overlap. /// /// \param overlaps An array of overlaps. /// \param anchors The array of anchors used to generate overlaps. From 178b075fb37f65df4ad67877c835503d6efcee03 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 19:16:23 -0500 Subject: [PATCH 21/26] Remove unnecessary assignment in chainerutils. --- cudamapper/src/chainer_utils.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index d6bd962c9..1f68d325d 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -193,8 +193,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps while (index != -1) { anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index; - int32_t pred = predecessors[index]; - index = pred; + index = predecessors[index]; ++anchor_chain_index; } } From 6244636dab64dc9dc845b8c743642cc9ab9f6cc1 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 20:39:59 -0500 Subject: [PATCH 22/26] [chainerutils] Refactor allocate_anchor_chains to use thrust, rather than CUB. --- cudamapper/src/chainer_utils.cu | 67 ++++++------------- cudamapper/src/overlapper_triggered.cu | 5 -- .../tests/Test_CudamapperChainerUtils.cu | 9 ++- 3 files changed, 28 insertions(+), 53 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 1f68d325d..5d6c135bf 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -17,6 +17,9 @@ #include "chainer_utils.cuh" #include +#include +#include +#include #include namespace claraparabricks @@ -38,6 +41,14 @@ struct OverlapToNumResiduesOp } }; +struct ConvertOverlapToNumResidues : public thrust::unary_function +{ + __host__ __device__ int32_t operator()(const Overlap& o) const + { + return o.num_residues_; + } +}; + __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& end, const int32_t num_anchors) { Overlap overlap; @@ -113,57 +124,19 @@ void allocate_anchor_chains(const device_buffer& overlaps, DefaultDeviceAllocator allocator, cudaStream_t cuda_stream) { - // sum the number of chains across all overlaps - - void* temp_storage_d = nullptr; - std::size_t temp_storage_bytes = 0; - OverlapToNumResiduesOp overlap_residue_count_op; - cub::TransformInputIterator residue_counts_d(overlaps.data(), - overlap_residue_count_op); - - device_buffer num_total_anchors_d(1, allocator, cuda_stream); - - cub::DeviceReduce::Sum(temp_storage_d, - temp_storage_bytes, - residue_counts_d, - num_total_anchors_d.data(), - overlaps.size(), - cuda_stream); - - device_buffer temp_buf_d(temp_storage_bytes, allocator, cuda_stream); - temp_storage_d = temp_buf_d.data(); - - cub::DeviceReduce::Sum(temp_storage_d, - temp_storage_bytes, - residue_counts_d, - num_total_anchors_d.data(), - overlaps.size(), - cuda_stream); - - temp_storage_d = nullptr; - temp_storage_bytes = 0; - - num_total_anchors = cudautils::get_value_from_device(num_total_anchors_d.data(), cuda_stream); + auto thrust_exec_policy = thrust::cuda::par(allocator).on(cuda_stream); + num_total_anchors = thrust::reduce(thrust_exec_policy, + thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), + thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), + 0); unrolled_anchor_chains.clear_and_resize(num_total_anchors); anchor_chain_starts.clear_and_resize(overlaps.size()); - cub::DeviceScan::ExclusiveSum(temp_storage_d, - temp_storage_bytes, - residue_counts_d, - anchor_chain_starts.data(), - overlaps.size(), - cuda_stream); - - temp_buf_d.clear_and_resize(temp_storage_bytes); - temp_storage_d = temp_buf_d.data(); - - cub::DeviceScan::ExclusiveSum(temp_storage_d, - temp_storage_bytes, - residue_counts_d, - anchor_chain_starts.data(), - overlaps.size(), - cuda_stream); + thrust::exclusive_scan(thrust_exec_policy, + thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), + thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), + anchor_chain_starts.data()); } __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, diff --git a/cudamapper/src/overlapper_triggered.cu b/cudamapper/src/overlapper_triggered.cu index 331ebb936..68ed98160 100644 --- a/cudamapper/src/overlapper_triggered.cu +++ b/cudamapper/src/overlapper_triggered.cu @@ -24,11 +24,6 @@ #include -#ifndef NDEBUG // only needed to check if input is sorted in assert -#include -#include -#endif - namespace claraparabricks { diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index af95802dc..7ae64c78b 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -67,7 +67,7 @@ TEST(TestChainerUtils, Create_Overlap_Tests) ASSERT_EQ(static_cast(p_bc.relative_strand), static_cast(RelativeStrand::Reverse)); } -TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) +TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests) { DefaultDeviceAllocator allocator = create_default_device_allocator(); @@ -120,6 +120,13 @@ TEST(TestChainerUtils, Anchor_Chain_Extraction_Tests) num_total_anchors, allocator, cu_ptr); + + ASSERT_EQ(num_total_anchors, 4); + + std::vector anchors_chain_starts_h(overlaps.size()); + cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), cu_ptr); + ASSERT_EQ(anchors_chain_starts_h[0], 0); + ASSERT_EQ(anchors_chain_starts_h[1], 2); } } // namespace cudamapper From d933b4c2707400ef04ca8eadf66162a147b56123 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 2 Nov 2020 21:14:21 -0500 Subject: [PATCH 23/26] [chainerutils] Use a gride-stride function with the block as the unit to process backtraced overlaps. --- cudamapper/src/chainer_utils.cu | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 5d6c135bf..342325831 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -84,11 +84,16 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, const int64_t n_anchors, const int32_t min_score) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; + const int64_t block_id = blockIdx.x; + const int32_t stride = gridDim.x; - for (int i = tid; i < n_anchors; i += stride) + for (int i = block_id; i < n_anchors; i += stride) { + + if (threadIdx.x != 0) + { + return; + } if (scores[i] >= min_score) { int32_t index = i; @@ -148,14 +153,17 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps const int32_t num_overlaps, const bool check_mask) { - const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; - const int32_t stride = blockDim.x * gridDim.x; + const int32_t block_id = blockIdx.x; + const int32_t stride = gridDim.x; // Processes one overlap per iteration, // "i" corresponds to an overlap - for (int i = thread_id; i < num_overlaps; i += stride) + for (int i = block_id; i < num_overlaps; i += stride) { - // index within this chain of anchors (i.e., the anchors within a single overlap) + if (threadIdx.x != 0) + { + return; + } if (!check_mask || (check_mask & select_mask[i])) { From b3102926c6559ca5dc85dfdd5c8d0aa68ace7339 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Tue, 3 Nov 2020 08:33:40 -0500 Subject: [PATCH 24/26] [chainerutils] Convert single-block functions back to single-thread. --- cudamapper/src/chainer_utils.cu | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index 342325831..d4c714e45 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -84,16 +84,11 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, const int64_t n_anchors, const int32_t min_score) { - const int64_t block_id = blockIdx.x; - const int32_t stride = gridDim.x; + const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; - for (int i = block_id; i < n_anchors; i += stride) + for (int i = thread_id; i < n_anchors; i += stride) { - - if (threadIdx.x != 0) - { - return; - } if (scores[i] >= min_score) { int32_t index = i; @@ -153,17 +148,13 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps const int32_t num_overlaps, const bool check_mask) { - const int32_t block_id = blockIdx.x; - const int32_t stride = gridDim.x; + const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t stride = blockDim.x * gridDim.x; // Processes one overlap per iteration, // "i" corresponds to an overlap - for (int i = block_id; i < num_overlaps; i += stride) + for (int i = thread_id; i < num_overlaps; i += stride) { - if (threadIdx.x != 0) - { - return; - } if (!check_mask || (check_mask & select_mask[i])) { From 405ec7e4e0b090a3323da5970c038a219078c5bc Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Fri, 6 Nov 2020 10:29:48 -0500 Subject: [PATCH 25/26] [cudamapper] Fix issues from PR review. - Use thrust transform_reduce and transform_exclusive_scan (rather than make_transform iterator) whe> - Constify pointers where possible. - Remove unnecessary includes. - Tweak docs for backtrace_anchors_to_overlap --- cudaaligner/samples/sample_cudaaligner.cpp | 1 - cudamapper/samples/sample_cudamapper.cpp | 1 - cudamapper/src/chainer_utils.cu | 46 +++++++++++----------- cudamapper/src/chainer_utils.cuh | 8 ++-- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cudaaligner/samples/sample_cudaaligner.cpp b/cudaaligner/samples/sample_cudaaligner.cpp index 50f62dc6a..f236ae2b3 100644 --- a/cudaaligner/samples/sample_cudaaligner.cpp +++ b/cudaaligner/samples/sample_cudaaligner.cpp @@ -27,7 +27,6 @@ #include #include #include -#include using namespace claraparabricks::genomeworks; using namespace claraparabricks::genomeworks::genomeutils; diff --git a/cudamapper/samples/sample_cudamapper.cpp b/cudamapper/samples/sample_cudamapper.cpp index f31a38829..142439d4b 100644 --- a/cudamapper/samples/sample_cudamapper.cpp +++ b/cudamapper/samples/sample_cudamapper.cpp @@ -29,7 +29,6 @@ #include #include #include -#include #include // define constants. See cudamapper/src/application_parameters.hpp for more. diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index d4c714e45..c116667cf 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -16,9 +16,8 @@ #include "chainer_utils.cuh" -#include #include -#include +#include #include #include @@ -33,14 +32,6 @@ namespace cudamapper namespace chainerutils { -struct OverlapToNumResiduesOp -{ - __device__ __forceinline__ int32_t operator()(const Overlap& overlap) const - { - return overlap.num_residues_; - } -}; - struct ConvertOverlapToNumResidues : public thrust::unary_function { __host__ __device__ int32_t operator()(const Overlap& o) const @@ -77,12 +68,12 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& en } __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, - Overlap* overlaps, - double* const scores, + Overlap* const overlaps, + const double* const scores, bool* const max_select_mask, - int32_t* const predecessors, + const int32_t* const predecessors, const int64_t n_anchors, - const int32_t min_score) + const double min_score) { const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; const int32_t stride = blockDim.x * gridDim.x; @@ -125,18 +116,29 @@ void allocate_anchor_chains(const device_buffer& overlaps, cudaStream_t cuda_stream) { auto thrust_exec_policy = thrust::cuda::par(allocator).on(cuda_stream); - num_total_anchors = thrust::reduce(thrust_exec_policy, - thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), - thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), - 0); + thrust::plus sum_op; + num_total_anchors = thrust::transform_reduce(thrust_exec_policy, + overlaps.begin(), + overlaps.end(), + ConvertOverlapToNumResidues(), + 0, + sum_op); unrolled_anchor_chains.clear_and_resize(num_total_anchors); anchor_chain_starts.clear_and_resize(overlaps.size()); - thrust::exclusive_scan(thrust_exec_policy, - thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), - thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), - anchor_chain_starts.data()); + // thrust::exclusive_scan(thrust_exec_policy, + // thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), + // thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), + // anchor_chain_starts.data()); + + thrust::transform_exclusive_scan(thrust_exec_policy, + overlaps.begin(), + overlaps.end(), + anchor_chain_starts.data(), + ConvertOverlapToNumResidues(), + 0, + sum_op); } __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index f7488eb78..50e184541 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -46,17 +46,17 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, /// \param anchors An array of anchors. /// \param overlaps An array of overlaps to be filled. /// \param scores An array of scores. Only chains with a score greater than min_score will be backtraced. -/// \param max_select_mask A boolean mask, used to mask out any subchains during the backtrace. +/// \param max_select_mask A boolean mask, used to mask out any chains which are completely contained within larger chains during the backtrace. /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain. /// \param n_anchors The number of anchors. /// \param min_score The minimum score of a chain for performing backtracing. __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, Overlap* const overlaps, - double* const scores, + const double* const scores, bool* const max_select_mask, - int32_t* const predecessors, + const int32_t* const predecessors, const int64_t n_anchors, - const int32_t min_score); + const double min_score); /// \brief Allocate a 1-dimensional array representing an unrolled ragged array /// of anchors within each overlap. The final array holds the indices within the anchors array From 530adfd65e913d03fe7d6e1709786d91a9b4b608 Mon Sep 17 00:00:00 2001 From: "Eric T. Dawson" Date: Mon, 9 Nov 2020 21:57:35 -0500 Subject: [PATCH 26/26] [chainerutils] Address review comments. Primarily, this adds a number of tests for output_overlap_chains_by_RLE, output_overlap_chains_by_backtrace, and backtrace_anchors_to_overlaps. It also fixes a bug where the last anchor position in a chain was not returned from backtrace_anchors_to_overlaps, causing faulty output when later extracting anchor chains. --- cudamapper/src/chainer_utils.cu | 39 ++- cudamapper/src/chainer_utils.cuh | 18 +- cudamapper/src/overlapper_triggered.cu | 5 + .../tests/Test_CudamapperChainerUtils.cu | 278 ++++++++++++++++-- 4 files changed, 299 insertions(+), 41 deletions(-) diff --git a/cudamapper/src/chainer_utils.cu b/cudamapper/src/chainer_utils.cu index c116667cf..af4490652 100644 --- a/cudamapper/src/chainer_utils.cu +++ b/cudamapper/src/chainer_utils.cu @@ -32,6 +32,15 @@ namespace cudamapper namespace chainerutils { +__device__ __forceinline__ Anchor empty_anchor() +{ + Anchor a; + a.query_read_id_ = UINT32_MAX; + a.query_position_in_read_ = UINT32_MAX; + a.target_read_id_ = UINT32_MAX; + a.target_position_in_read_ = UINT32_MAX; + return a; +} struct ConvertOverlapToNumResidues : public thrust::unary_function { __host__ __device__ int32_t operator()(const Overlap& o) const @@ -69,11 +78,12 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, const Anchor& en __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, Overlap* const overlaps, - const double* const scores, + int32_t* const overlap_terminal_anchors, + const float* const scores, bool* const max_select_mask, const int32_t* const predecessors, const int64_t n_anchors, - const double min_score) + const float min_score) { const int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x; const int32_t stride = blockDim.x * gridDim.x; @@ -98,12 +108,15 @@ __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, num_anchors_in_chain++; index = predecessors[index]; } - Anchor first_anchor = anchors[first_index]; - overlaps[i] = create_overlap(first_anchor, final_anchor, num_anchors_in_chain); + Anchor first_anchor = anchors[first_index]; + overlap_terminal_anchors[i] = i; + overlaps[i] = create_overlap(first_anchor, final_anchor, num_anchors_in_chain); } else { - max_select_mask[i] = false; + max_select_mask[i] = false; + overlap_terminal_anchors[i] = -1; + overlaps[i] = create_overlap(empty_anchor(), empty_anchor(), 1); } } } @@ -127,11 +140,6 @@ void allocate_anchor_chains(const device_buffer& overlaps, unrolled_anchor_chains.clear_and_resize(num_total_anchors); anchor_chain_starts.clear_and_resize(overlaps.size()); - // thrust::exclusive_scan(thrust_exec_policy, - // thrust::make_transform_iterator(overlaps.begin(), ConvertOverlapToNumResidues()), - // thrust::make_transform_iterator(overlaps.end(), ConvertOverlapToNumResidues()), - // anchor_chain_starts.data()); - thrust::transform_exclusive_scan(thrust_exec_policy, overlaps.begin(), overlaps.end(), @@ -145,6 +153,7 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps const Anchor* const anchors, const bool* const select_mask, const int32_t* const predecessors, + const int32_t* const chain_terminators, int32_t* const anchor_chains, int32_t* const anchor_chain_starts, const int32_t num_overlaps, @@ -163,11 +172,11 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps int32_t anchor_chain_index = 0; // As chaining proceeds backwards (i.e., it's a backtrace), // we need to fill the new anchor chain array in in reverse order. - int32_t index = anchor_chain_starts[i]; + int32_t index = chain_terminators[i]; while (index != -1) { - anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index)] = index; - index = predecessors[index]; + anchor_chains[anchor_chain_starts[i] + (overlaps[i].num_residues_ - anchor_chain_index - 1)] = index; + index = predecessors[index]; ++anchor_chain_index; } } @@ -188,9 +197,9 @@ __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, { int32_t chain_start = chain_starts[i]; int32_t chain_length = chain_lengths[i]; - for (int32_t ind = chain_start; ind < chain_start + chain_length; ++i) + for (int32_t index = chain_start; index < chain_start + chain_length; ++index) { - anchor_chains[ind] = ind; + anchor_chains[index] = index; } } } diff --git a/cudamapper/src/chainer_utils.cuh b/cudamapper/src/chainer_utils.cuh index 50e184541..d564315fe 100644 --- a/cudamapper/src/chainer_utils.cuh +++ b/cudamapper/src/chainer_utils.cuh @@ -50,23 +50,26 @@ __host__ __device__ Overlap create_overlap(const Anchor& start, /// \param predecessors An array of indices into the anchors array marking the predecessor of each anchor within a chain. /// \param n_anchors The number of anchors. /// \param min_score The minimum score of a chain for performing backtracing. +/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors were allocated. +/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...) __global__ void backtrace_anchors_to_overlaps(const Anchor* const anchors, Overlap* const overlaps, - const double* const scores, + int32_t* const overlap_terminal_anchors, + const float* const scores, bool* const max_select_mask, const int32_t* const predecessors, const int64_t n_anchors, - const double min_score); + const float min_score); /// \brief Allocate a 1-dimensional array representing an unrolled ragged array -/// of anchors within each overlap. The final array holds the indices within the anchors array +/// of anchors within each overlap. The final array holds the indices within the anchors array used in chaining /// of the anchors in the chain. /// /// \param overlaps An array of Overlaps. Must have a well-formed num_residues_ field /// \param unrolled_anchor_chains An array of int32_t. Will be resized on return. -/// \param anchor_chain_starts An array holding the index in the anchors array of the first anchor in an overlap. +/// \param anchor_chain_starts An array holding the index of the first anchor in an overlap within the anchors array used during chaining. /// \param num_overlaps The number of overlaps in the overlaps array. -/// \param num_total_anchors The number of anchors in the anchors array. +/// \param num_total_anchors The total number of anchors across all overlaps. /// \param allocator The DefaultDeviceAllocator for this overlapper. /// \param cuda_stream The cudastream to allocate memory within. void allocate_anchor_chains(const device_buffer& overlaps, @@ -89,10 +92,13 @@ void allocate_anchor_chains(const device_buffer& overlaps, /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array /// \param check_mask A boolean. If true, only overlaps where select_mask is true will have their anchor chains calculated. +/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated. +/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...) __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps, const Anchor* const anchors, const bool* const select_mask, const int32_t* const predecessors, + const int32_t* const chain_terminators, int32_t* const anchor_chains, int32_t* const anchor_chain_starts, const int32_t num_overlaps, @@ -107,6 +113,8 @@ __global__ void output_overlap_chains_by_backtrace(const Overlap* const overlaps /// \param anchor_chains An array (allocated by allocate_anchor_chains) which will hold the indices of anchors within each chain. /// \param anchor_chain_starts An array which holds the indices of the first anchor for each overlap in the overlaps array. /// \param num_overlaps The number of overlaps in the overlaps array. +/// Launch configuration: any number of blocks/threads, no dynamic shared memory, cuda_stream must be the same in which anchors/overlaps/scores/mask/predecessors/chains/starts were allocated. +/// example: backtrace_anchors_to_overlaps<<<2048, 32, 0, cuda_stream>>>(...) __global__ void output_overlap_chains_by_RLE(const Overlap* const overlaps, const Anchor* const anchors, const int32_t* const chain_starts, diff --git a/cudamapper/src/overlapper_triggered.cu b/cudamapper/src/overlapper_triggered.cu index 68ed98160..d55f63fd0 100644 --- a/cudamapper/src/overlapper_triggered.cu +++ b/cudamapper/src/overlapper_triggered.cu @@ -24,6 +24,11 @@ #include +#include + +#ifndef NDEBUG +#include +#endif namespace claraparabricks { diff --git a/cudamapper/tests/Test_CudamapperChainerUtils.cu b/cudamapper/tests/Test_CudamapperChainerUtils.cu index 7ae64c78b..71447e17f 100644 --- a/cudamapper/tests/Test_CudamapperChainerUtils.cu +++ b/cudamapper/tests/Test_CudamapperChainerUtils.cu @@ -16,8 +16,6 @@ #include "gtest/gtest.h" -#include -#include #include #include @@ -53,7 +51,9 @@ TEST(TestChainerUtils, Create_Overlap_Tests) ASSERT_EQ(p_ab.query_end_position_in_read_, 10000); ASSERT_EQ(p_ab.target_start_position_in_read_, 10320); ASSERT_EQ(p_ab.num_residues_, 12); - ASSERT_EQ(static_cast(p_ab.relative_strand), static_cast(RelativeStrand::Forward)); + // ASSERT_EQ(static_cast(p_ab.relative_strand), static_cast(RelativeStrand::Forward)); + + ASSERT_EQ(p_ab.relative_strand, RelativeStrand::Forward); Anchor c; c.query_read_id_ = 12; @@ -64,15 +64,92 @@ TEST(TestChainerUtils, Create_Overlap_Tests) Overlap p_bc = chainerutils::create_overlap(b, c, 22); ASSERT_EQ(p_bc.target_start_position_in_read_, 16000); ASSERT_EQ(p_bc.target_end_position_in_read_, 20400); - ASSERT_EQ(static_cast(p_bc.relative_strand), static_cast(RelativeStrand::Reverse)); + ASSERT_EQ(p_bc.relative_strand, RelativeStrand::Reverse); +} + +TEST(TestChainerUtils, Anchor_Backtrace_Tests) +{ + Anchor a; + a.query_read_id_ = 12; + a.query_position_in_read_ = 130; + a.target_read_id_ = 46; + a.target_position_in_read_ = 10320; + + Anchor b; + b.query_read_id_ = 12; + b.query_position_in_read_ = 10000; + b.target_read_id_ = 46; + b.target_position_in_read_ = 20400; + + Anchor c; + c.query_read_id_ = 12; + c.query_position_in_read_ = 15000; + c.target_read_id_ = 46; + c.target_position_in_read_ = 1000; + + std::vector anchors = {a, b, c}; + std::vector scores = {10, 20, 25}; + std::vector predecessors = {-1, 0, 0}; + + DefaultDeviceAllocator allocator = create_default_device_allocator(); + CudaStream backtrace_cuda_stream = make_cuda_stream(); + auto backtrace_cu_stream_ptr = backtrace_cuda_stream.get(); + + device_buffer anchors_d(anchors.size(), allocator, backtrace_cu_stream_ptr); + device_buffer overlaps_d(anchors.size(), allocator, backtrace_cu_stream_ptr); + device_buffer scores_d(anchors.size(), allocator, backtrace_cu_stream_ptr); + device_buffer predecessors_d(anchors.size(), allocator, backtrace_cu_stream_ptr); + device_buffer overlap_terminal_anchors(anchors.size(), allocator, backtrace_cu_stream_ptr); + device_buffer mask_d(anchors.size(), allocator, backtrace_cu_stream_ptr); + + cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), backtrace_cu_stream_ptr); + cudautils::device_copy_n(scores.data(), scores.size(), scores_d.data(), backtrace_cu_stream_ptr); + cudautils::device_copy_n(predecessors.data(), predecessors.size(), predecessors_d.data(), backtrace_cu_stream_ptr); + + chainerutils::backtrace_anchors_to_overlaps<<<64, 32, 0, backtrace_cu_stream_ptr>>>(anchors_d.data(), + overlaps_d.data(), + overlap_terminal_anchors.data(), + scores_d.data(), + mask_d.data(), + predecessors_d.data(), + anchors_d.size(), + 20); + + std::vector overlaps; + overlaps.resize(overlaps_d.size()); + cudautils::device_copy_n(overlaps_d.data(), overlaps_d.size(), overlaps.data(), backtrace_cu_stream_ptr); + std::vector terminals_h; + terminals_h.resize(overlaps.size()); + cudautils::device_copy_n(overlap_terminal_anchors.data(), overlap_terminal_anchors.size(), terminals_h.data(), backtrace_cu_stream_ptr); + ASSERT_EQ(overlaps_d.size(), 3); + + ASSERT_EQ(overlaps[0].num_residues_, 1); + ASSERT_EQ(overlaps[0].target_read_id_, UINT32_MAX); + + ASSERT_EQ(overlaps[1].num_residues_, 2); + ASSERT_EQ(overlaps[1].query_read_id_, 12); + ASSERT_EQ(overlaps[1].query_start_position_in_read_, 130); + ASSERT_EQ(overlaps[1].target_start_position_in_read_, 10320); + ASSERT_EQ(overlaps[1].query_end_position_in_read_, 10000); + ASSERT_EQ(overlaps[1].target_end_position_in_read_, 20400); + ASSERT_EQ(overlaps[1].relative_strand, RelativeStrand::Forward); + + ASSERT_EQ(overlaps[2].num_residues_, 2); + ASSERT_EQ(overlaps[2].query_read_id_, 12); + ASSERT_EQ(overlaps[2].query_end_position_in_read_, 15000); + ASSERT_EQ(overlaps[2].relative_strand, RelativeStrand::Reverse); + + ASSERT_EQ(terminals_h[0], -1); + ASSERT_EQ(terminals_h[1], 1); + ASSERT_EQ(terminals_h[2], 2); } TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests) { DefaultDeviceAllocator allocator = create_default_device_allocator(); - CudaStream cuda_stream = make_cuda_stream(); - auto cu_ptr = cuda_stream.get(); + CudaStream chain_alloc_cuda_stream = make_cuda_stream(); + auto chain_alloc_cu_stream_ptr = chain_alloc_cuda_stream.get(); Anchor a; a.query_read_id_ = 12; @@ -96,22 +173,17 @@ TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests) Overlap p_bc = chainerutils::create_overlap(b, c, 2); - std::vector anchors; - anchors.push_back(a); - anchors.push_back(b); - anchors.push_back(c); + std::vector anchors = {a, b, c}; - std::vector overlaps; - overlaps.push_back(p_ab); - overlaps.push_back(p_bc); + std::vector overlaps = {p_ab, p_bc}; - device_buffer anchors_d(anchors.size(), allocator, cu_ptr); - device_buffer overlaps_d(overlaps.size(), allocator, cu_ptr); - cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), cu_ptr); - cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), cu_ptr); + device_buffer anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr); - device_buffer unrolled_anchor_chains(0, allocator, cu_ptr); - device_buffer chain_starts(0, allocator, cu_ptr); + device_buffer unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr); + device_buffer chain_starts(0, allocator, chain_alloc_cu_stream_ptr); int64_t num_total_anchors; chainerutils::allocate_anchor_chains(overlaps_d, @@ -119,14 +191,178 @@ TEST(TestChainerUtils, Anchor_Chain_Allocation_Tests) chain_starts, num_total_anchors, allocator, - cu_ptr); + chain_alloc_cu_stream_ptr); ASSERT_EQ(num_total_anchors, 4); std::vector anchors_chain_starts_h(overlaps.size()); - cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), cu_ptr); + cudautils::device_copy_n(chain_starts.data(), num_total_anchors, anchors_chain_starts_h.data(), chain_alloc_cu_stream_ptr); ASSERT_EQ(anchors_chain_starts_h[0], 0); ASSERT_EQ(anchors_chain_starts_h[1], 2); + ASSERT_EQ(num_total_anchors, 4); +} + +TEST(TestChainerUtils, Test_Output_Overlap_Chains_By_RLE) +{ + DefaultDeviceAllocator allocator = create_default_device_allocator(); + + CudaStream chain_alloc_cuda_stream = make_cuda_stream(); + auto chain_alloc_cu_stream_ptr = chain_alloc_cuda_stream.get(); + + Anchor a; + a.query_read_id_ = 12; + a.query_position_in_read_ = 130; + a.target_read_id_ = 46; + a.target_position_in_read_ = 10320; + + Anchor b; + b.query_read_id_ = 12; + b.query_position_in_read_ = 10000; + b.target_read_id_ = 46; + b.target_position_in_read_ = 20400; + + Anchor c; + c.query_read_id_ = 12; + c.query_position_in_read_ = 15000; + c.target_read_id_ = 46; + c.target_position_in_read_ = 1000; + + Anchor d; + d.query_read_id_ = 12; + d.query_position_in_read_ = 20000; + d.target_read_id_ = 46; + d.target_position_in_read_ = 6000; + + std::vector anchors = {a, b, c, d}; + + std::vector run_starts_h = {0, 2}; + std::vector run_lengths_h = {2, 2}; + device_buffer run_starts_d(run_starts_h.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer run_lengths_d(run_starts_h.size(), allocator, chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(run_starts_h.data(), run_starts_h.size(), run_starts_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(run_lengths_h.data(), run_lengths_h.size(), run_lengths_d.data(), chain_alloc_cu_stream_ptr); + + Overlap p_ab = chainerutils::create_overlap(a, b, 2); + Overlap p_cd = chainerutils::create_overlap(c, d, 2); + std::vector overlaps = {p_ab, p_cd}; + device_buffer overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr); + + device_buffer chain_starts(0, allocator, chain_alloc_cu_stream_ptr); + device_buffer unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr); + int64_t num_total_anchors = 0; + chainerutils::allocate_anchor_chains(overlaps_d, + unrolled_anchor_chains, + chain_starts, + num_total_anchors, + allocator, + chain_alloc_cu_stream_ptr); + + chainerutils::output_overlap_chains_by_RLE<<<1024, 64, 0, chain_alloc_cu_stream_ptr>>>(overlaps_d.data(), + anchors_d.data(), + run_starts_d.data(), + run_lengths_d.data(), + unrolled_anchor_chains.data(), + chain_starts.data(), + overlaps.size()); + std::vector chain_starts_h; + chain_starts_h.resize(chain_starts.size()); + cudautils::device_copy_n(chain_starts.data(), chain_starts.size(), chain_starts_h.data(), chain_alloc_cu_stream_ptr); + std::vector chains_h; + chains_h.resize(unrolled_anchor_chains.size()); + cudautils::device_copy_n(unrolled_anchor_chains.data(), unrolled_anchor_chains.size(), chains_h.data(), chain_alloc_cu_stream_ptr); + ASSERT_EQ(unrolled_anchor_chains.size(), 4); + ASSERT_EQ(chain_starts.size(), 2); + ASSERT_EQ(chain_starts_h[0], 0); + ASSERT_EQ(chain_starts_h[1], 2); + ASSERT_EQ(chains_h[0], 0); + ASSERT_EQ(chains_h[2], 2); +} + +TEST(TestChainerUtils, Test_Output_Overlap_Chains_By_Backtrace) +{ + + DefaultDeviceAllocator allocator = create_default_device_allocator(); + + CudaStream chain_alloc_cuda_stream = make_cuda_stream(); + auto chain_alloc_cu_stream_ptr = chain_alloc_cuda_stream.get(); + + Anchor a; + a.query_read_id_ = 12; + a.query_position_in_read_ = 130; + a.target_read_id_ = 46; + a.target_position_in_read_ = 10320; + + Anchor b; + b.query_read_id_ = 12; + b.query_position_in_read_ = 10000; + b.target_read_id_ = 46; + b.target_position_in_read_ = 20400; + + Anchor c; + c.query_read_id_ = 12; + c.query_position_in_read_ = 15000; + c.target_read_id_ = 46; + c.target_position_in_read_ = 1000; + + std::vector anchors = {a, b, c}; + std::vector scores = {10, 20, 25}; + std::vector predecessors = {-1, 0, 0}; + std::vector overlap_terminal_anchors = {1, 2}; + + Overlap p_ab = chainerutils::create_overlap(a, b, 2); + Overlap p_ac = chainerutils::create_overlap(a, c, 2); + std::vector overlaps = {p_ab, p_ac}; + + device_buffer anchors_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer overlaps_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer mask_d(anchors.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer pred_d(predecessors.size(), allocator, chain_alloc_cu_stream_ptr); + device_buffer terminals_d(overlaps.size(), allocator, chain_alloc_cu_stream_ptr); + + cudautils::device_copy_n(anchors.data(), anchors.size(), anchors_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(overlaps.data(), overlaps.size(), overlaps_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(predecessors.data(), predecessors.size(), pred_d.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(overlap_terminal_anchors.data(), overlap_terminal_anchors.size(), terminals_d.data(), chain_alloc_cu_stream_ptr); + + device_buffer unrolled_anchor_chains(0, allocator, chain_alloc_cu_stream_ptr); + device_buffer chain_starts(0, allocator, chain_alloc_cu_stream_ptr); + int64_t num_total_anchors; + + chainerutils::allocate_anchor_chains(overlaps_d, + unrolled_anchor_chains, + chain_starts, + num_total_anchors, + allocator, + chain_alloc_cu_stream_ptr); + + chainerutils::output_overlap_chains_by_backtrace<<<1024, 64, 0, chain_alloc_cu_stream_ptr>>>(overlaps_d.data(), + anchors_d.data(), + mask_d.data(), + pred_d.data(), + terminals_d.data(), + unrolled_anchor_chains.data(), + chain_starts.data(), + overlaps.size(), + false); + + std::vector unrolled_chains_h; + unrolled_chains_h.resize(unrolled_anchor_chains.size()); + std::vector chain_starts_h; + chain_starts_h.resize(chain_starts.size()); + cudautils::device_copy_n(unrolled_anchor_chains.data(), unrolled_anchor_chains.size(), unrolled_chains_h.data(), chain_alloc_cu_stream_ptr); + cudautils::device_copy_n(chain_starts.data(), chain_starts.size(), chain_starts_h.data(), chain_alloc_cu_stream_ptr); + + ASSERT_EQ(chain_starts_h.size(), 2); + ASSERT_EQ(chain_starts_h[0], 0); + ASSERT_EQ(chain_starts_h[1], 2); + ASSERT_EQ(unrolled_chains_h[0], 0); + ASSERT_EQ(unrolled_chains_h[1], 1); + ASSERT_EQ(unrolled_chains_h[2], 0); + ASSERT_EQ(unrolled_chains_h[3], 2); + ASSERT_EQ(unrolled_anchor_chains.size(), 4); } } // namespace cudamapper