intial commit for raja launch

LLNL · Dec 27, 2024 · 17f3d15 · 17f3d15
1 parent 1d57b1a
commit 17f3d15
Show file tree

Hide file tree

Showing 3 changed files with 238 additions and 8 deletions.
diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt
@@ -9,6 +9,9 @@ if(RAJA_ENABLE_CALIPER)
   raja_add_executable(
     NAME raja-caliper
     SOURCES raja-caliper.cpp caliper-plugin.cpp)
+  raja_add_executable(
+    NAME raja-launch-caliper
+    SOURCES raja-launch-caliper.cpp caliper-plugin.cpp)
 endif()
 
 raja_add_executable(

diff --git a/examples/plugin/raja-launch-caliper.cpp b/examples/plugin/raja-launch-caliper.cpp
@@ -0,0 +1,216 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <caliper/cali.h>
+
+#include "RAJA/RAJA.hpp"
+#include "RAJA/util/Timer.hpp"
+
+/*
+ *  RAJA Caliper integration with launch
+ */
+
+using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
+using loop_policy   = RAJA::LoopPolicy<RAJA::seq_exec>;
+
+//
+// Functions for checking and printing results
+//
+void checkResult(double* v1, double* v2, int len);
+void printResult(double* v, int len);
+
+int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
+{
+  std::cout << "\n\nRAJA daxpy example...\n";
+
+//
+  auto timer = RAJA::Timer();
+
+
+//
+// Define vector length
+//
+  const int N = 1000000;
+
+//
+// Allocate and initialize vector data.
+//
+  double* a0 = new double[N];
+  double* aref = new double[N];
+
+  double* ta = new double[N];
+  double* tb = new double[N];
+
+  double c = 3.14159;
+
+  for (int i = 0; i < N; i++) {
+    a0[i] = 1.0;
+    tb[i] = 2.0;
+  }
+
+//
+// Declare and set pointers to array data.
+// We reset them for each daxpy version so that
+// they all look the same.
+//
+
+  double* a = ta;
+  double* b = tb;
+
+
+//----------------------------------------------------------------------------//
+
+  std::cout << "\n Running C-version of daxpy...\n";
+
+  std::memcpy( a, a0, N * sizeof(double) );
+  {
+    timer.start();
+    CALI_CXX_MARK_SCOPE("CALI: C-version elapsed time");
+    for (int i = 0; i < N; ++i) {
+      a[i] += b[i] * c;
+    }
+    timer.stop();
+    RAJA::Timer::ElapsedType etime = timer.elapsed();
+    std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl;
+  }
+
+  std::memcpy( aref, a, N* sizeof(double) );
+
+//----------------------------------------------------------------------------//
+
+
+//----------------------------------------------------------------------------//
+
+  std::cout << "\n Running launch sequential daxpy...\n";
+
+  std::memcpy( a, a0, N * sizeof(double) );
+ {
+   timer.reset();
+   timer.start();
+    RAJA::launch<launch_policy>
+      (RAJA::LaunchParams(RAJA::Teams(), RAJA::Threads()),
+       RAJA::expt::KernelName("CALI: launch kernel"),
+       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+         RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i)
+         {
+           a[i] += b[i] * c;
+         });
+
+       });
+    timer.stop();
+    RAJA::Timer::ElapsedType etime = timer.elapsed();
+    std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl;
+  }
+  checkResult(a, aref, N);
+//printResult(a, N);
+
+//----------------------------------------------------------------------------//
+
+#if defined(RAJA_ENABLE_CUDA)
+//
+// RAJA CUDA parallel GPU version (256 threads per thread block).
+//
+  std::cout << "\n Running RAJA CUDA daxpy...\n";
+
+  a = 0; b = 0;
+  cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) ));
+  cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) ));
+
+  cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice ));
+  cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice ));
+
+  //RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, N),
+  //[=] RAJA_DEVICE (int i) {
+  //a[i] += b[i] * c;
+  //});
+
+  cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost ));
+
+  cudaErrchk(cudaFree(a));
+  cudaErrchk(cudaFree(b));
+
+  a = ta;
+  checkResult(a, aref, N);
+//printResult(a, N);
+#endif
+
+//----------------------------------------------------------------------------//
+
+#if defined(RAJA_ENABLE_HIP)
+//
+// RAJA HIP parallel GPU version (256 threads per thread block).
+//
+  std::cout << "\n Running RAJA HIP daxpy...\n";
+
+  a = 0; b = 0;
+  hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) ));
+  hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) ));
+
+  hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice ));
+  hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice ));
+
+  //RAJA::forall<RAJA::hip_exec<256>>(RAJA::RangeSegment(0, N),
+  //[=] RAJA_DEVICE (int i) {
+  //a[i] += b[i] * c;
+  //});
+
+  hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost ));
+
+  hipErrchk(hipFree(a));
+  hipErrchk(hipFree(b));
+
+  a = ta;
+  checkResult(a, aref, N);
+//printResult(a, N);
+#endif
+
+//----------------------------------------------------------------------------//
+
+//
+// Clean up.
+//
+  delete[] a0;
+  delete[] aref;
+  delete[] ta;
+  delete[] tb;
+
+  std::cout << "\n DONE!...\n";
+
+  return 0;
+}
+
+//
+// Function to compare result to reference and report P/F.
+//
+void checkResult(double* v1, double* v2, int len)
+{
+  bool match = true;
+  for (int i = 0; i < len; i++) {
+    if ( v1[i] != v2[i] ) { match = false; }
+  }
+  if ( match ) {
+    std::cout << "\n\t result -- PASS\n";
+  } else {
+    std::cout << "\n\t result -- FAIL\n";
+  }
+}
+
+//
+// Function to print result.
+//
+void printResult(double* v, int len)
+{
+  std::cout << std::endl;
+  for (int i = 0; i < len; i++) {
+    std::cout << "result[" << i << "] = " << v[i] << std::endl;
+  }
+  std::cout << std::endl;
+}
diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
@@ -226,11 +226,14 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa
   //Get reducers
   auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
 
+  //kernel name
+  std::string kname;
+
   auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   //Take the first policy as we assume the second policy is not user defined.
   //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>(&kname)};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -260,11 +263,14 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_
   //Get reducers
   auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
 
+  //get kernel name
+  std::string kname;
+
   auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   //Take the first policy as we assume the second policy is not user defined.
   //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>(&kname)};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -409,15 +415,18 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
     place = RAJA::ExecPlace::DEVICE;
   }
 
+  //Get kernel name
+  std::string kname;
+
   //
   //Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
   util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+      util::make_context<typename POLICY_LIST::host_policy_t>(&kname) :
+      util::make_context<typename POLICY_LIST::device_policy_t>(&kname)};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>(&kname)};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -468,6 +477,8 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   //Get reducers
   auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
 
+  std::string kname;
+
   auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
@@ -482,10 +493,10 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   //
 #if defined(RAJA_GPU_ACTIVE)
   util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+      util::make_context<typename POLICY_LIST::host_policy_t>(&kname) :
+      util::make_context<typename POLICY_LIST::device_policy_t>(&kname)};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>(&kname)};
 #endif
 
   util::callPreCapturePlugins(context);