Skip to content

Commit

Permalink
Add static_map::insert_or_apply aka reduce-by-key (#515)
Browse files Browse the repository at this point in the history
  • Loading branch information
srinivasyadav18 authored Jul 4, 2024
1 parent 2ad911f commit a7f87ac
Show file tree
Hide file tree
Showing 10 changed files with 582 additions and 6 deletions.
3 changes: 2 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
hash_table/static_map/insert_bench.cu
hash_table/static_map/find_bench.cu
hash_table/static_map/contains_bench.cu
hash_table/static_map/erase_bench.cu)
hash_table/static_map/erase_bench.cu
hash_table/static_map/insert_or_apply_bench.cu)

###################################################################################################
# - static_multiset benchmarks --------------------------------------------------------------------
Expand Down
93 changes: 93 additions & 0 deletions benchmarks/hash_table/static_map/insert_or_apply_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark_defaults.hpp>
#include <benchmark_utils.hpp>

#include <cuco/static_map.cuh>
#include <cuco/utility/key_generator.cuh>

#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <thrust/transform.h>

using namespace cuco::benchmark;
using namespace cuco::utility;

/**
 * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
 */
template <typename Key, typename Value, typename Dist>
std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
{
  using pair_type = cuco::pair<Key, Value>;

  // Read benchmark axes, falling back to the library-wide defaults when an axis is absent.
  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);

  // Size the map so that the expected number of distinct keys lands at the target occupancy.
  std::size_t const capacity = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;

  // Generate the key sequence according to the requested distribution.
  thrust::device_vector<Key> keys(num_keys);
  key_generator gen;
  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

  // Build the input pairs on device: each key maps to itself, cast to the payload type.
  thrust::device_vector<pair_type> pairs(num_keys);
  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& k) {
    return pair_type(k, static_cast<Value>(k));
  });

  state.add_element_count(num_keys);

  cuco::static_map map{capacity, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};

  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
    // Reset the map between timed iterations; the clear is excluded from the measurement.
    map.clear_async({launch.get_stream()});

    timer.start();
    map.insert_or_apply_async(
      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
    timer.stop();
  });
}

// Fallback overload: the benchmark only supports equally-sized key/value types,
// so mismatched combinations in the type axes are skipped rather than compiled.
template <typename Key, typename Value, typename Dist>
std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
{
  state.skip("Key should be the same type as Value.");
}

// Sweep key multiplicity (duplicates per key) at the default occupancy,
// over all key/value type combinations with a uniform key distribution.
NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
                                      defaults::VALUE_TYPE_RANGE,
                                      nvbench::type_list<distribution::uniform>))
  .set_name("static_map_insert_or_apply_uniform_multiplicity")
  .set_type_axes_names({"Key", "Value", "Distribution"})
  .set_max_noise(defaults::MAX_NOISE)
  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);

// Sweep table occupancy at the default multiplicity,
// over all key/value type combinations with a uniform key distribution.
NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
                                      defaults::VALUE_TYPE_RANGE,
                                      nvbench::type_list<distribution::uniform>))
  .set_name("static_map_insert_or_apply_uniform_occupancy")
  .set_type_axes_names({"Key", "Value", "Distribution"})
  .set_max_noise(defaults::MAX_NOISE)
  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
43 changes: 42 additions & 1 deletion include/cuco/detail/static_map/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
namespace cuco::static_map_ns::detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

// TODO use insert_or_assign internally
/**
* @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
* `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
Expand Down Expand Up @@ -67,4 +68,44 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
}
}

} // namespace cuco::static_map_ns::detail
/**
 * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
 * `k` already exists in the container, then the binary operation is applied using the `op` callable
 * object on the existing value at the slot and the element to insert. If the key does not exist,
 * inserts the pair as if by insert.
 *
 * @note The callable object used to perform the binary operation must be invocable as
 * `Op(cuda::atomic_ref<T, Scope>, T)`.
 *
 * @tparam CGSize Number of threads in each CG
 * @tparam BlockSize Number of threads in each block
 * @tparam InputIt Device accessible input iterator whose `value_type` is
 * convertible to the `value_type` of the data structure
 * @tparam Op Callable type used to perform the `apply` operation
 * @tparam Ref Type of non-owning device ref allowing access to storage
 *
 * @param first Beginning of the sequence of input elements
 * @param n Number of input elements
 * @param op Callable object to perform the apply operation
 * @param ref Non-owning container device ref used to access the slot storage
 */
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply(InputIt first,
                                                              cuco::detail::index_type n,
                                                              Op op,
                                                              Ref ref)
{
  // Grid-stride loop over the input, one CG (of CGSize threads) per input element.
  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
  auto idx               = cuco::detail::global_thread_id() / CGSize;

  while (idx < n) {
    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
    if constexpr (CGSize == 1) {
      // Scalar path: each thread handles its own element.
      ref.insert_or_apply(insert_pair, op);
    } else {
      // Cooperative path: a tile of CGSize threads probes cooperatively.
      auto const tile =
        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
      ref.insert_or_apply(tile, insert_pair, op);
    }
    idx += loop_stride;
  }
}

} // namespace cuco::static_map_ns::detail
38 changes: 38 additions & 0 deletions include/cuco/detail/static_map/static_map.inl
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,44 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
first, num, ref(op::insert_or_assign));
}

template <class Key,
          class T,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
template <typename InputIt, typename Op>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
  insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream)
{
  // Launch the asynchronous version, then block on the stream so this synchronous
  // overload does not return before the operation has completed.
  // (Previously `return this->insert_or_apply_async(...);` made the subsequent
  // `stream.wait()` unreachable, so this API never actually synchronized.)
  this->insert_or_apply_async(first, last, op, stream);
  stream.wait();
}

template <class Key,
          class T,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
template <typename InputIt, typename Op>
void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept
{
  // Nothing to launch for an empty input range.
  auto const num_elements = cuco::detail::distance(first, last);
  if (num_elements == 0) { return; }

  // `default_block_size()` is a constant expression (it is used below as a
  // template argument), so it can be hoisted into a constexpr local.
  auto constexpr block_size = cuco::detail::default_block_size();
  auto const grid_size      = cuco::detail::grid_size(num_elements, cg_size);

  // Enqueue the kernel on the caller-provided stream; no synchronization here.
  static_map_ns::detail::insert_or_apply<cg_size, block_size>
    <<<grid_size, block_size, 0, stream.get()>>>(
      first, num_elements, op, ref(op::insert_or_apply));
}

template <class Key,
class T,
class Extent,
Expand Down
Loading

0 comments on commit a7f87ac

Please sign in to comment.