diff --git a/CMakeLists.txt b/CMakeLists.txt index 3eb0dbc8d2..beb2abf449 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,8 +154,20 @@ endif () set (raja_depends) +if (RAJA_ENABLE_CALIPER) +set (raja_depends + ${raja_depends} + caliper) + find_package(caliper REQUIRED + NO_DEFAULT_PATH + PATHS ${caliper_DIR} + ) +message(STATUS "Using Caliper from location = ${caliper_DIR}") +endif() + if (RAJA_ENABLE_OPENMP) set (raja_depends + ${raja_depends} openmp) endif() diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index feb5247224..2002f39b6f 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -325,7 +325,7 @@ int main(int argc, char *argv[]) RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), - "Matrix tranpose with dynamic shared memory kernel", + RAJA::expt::KernelName("Matrix tranpose with dynamic shared memory kernel"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 5bec907c33..353a9b61e0 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -151,7 +151,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_maxloc2(-1); RAJA::launch - (host_res, RAJA::LaunchParams(), "SeqReductionKernel", + (host_res, RAJA::LaunchParams(), RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), @@ -159,6 +159,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&seq_maxloc), RAJA::expt::ReduceLoc(&seq_min2, &seq_minloc2), RAJA::expt::ReduceLoc(&seq_max2, &seq_maxloc2), + RAJA::expt::KernelName("SeqReductionKernel"), [=] RAJA_HOST_DEVICE ( RAJA::LaunchContext ctx, VALOP_INT_SUM &_seq_sum, VALOP_INT_MIN &_seq_min, @@ -218,7 +219,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_maxloc2(-1); RAJA::launch - (host_res, RAJA::LaunchParams(), "OmpReductionKernel", + (host_res, RAJA::LaunchParams(), RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), @@ -226,6 +227,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&omp_maxloc), RAJA::expt::ReduceLoc(&omp_min2, &omp_minloc2), RAJA::expt::ReduceLoc(&omp_max2, &omp_maxloc2), + RAJA::expt::KernelName("OmpReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_omp_sum, VALOP_INT_MIN &_omp_min, @@ -291,7 +293,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), - "CUDAReductionKernel", RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), RAJA::expt::Reduce(&cuda_max), @@ -299,6 +300,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&cuda_maxloc), RAJA::expt::ReduceLoc(&cuda_min2, &cuda_minloc2), RAJA::expt::ReduceLoc(&cuda_max2, &cuda_maxloc2), + RAJA::expt::KernelName( "CUDAReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_cuda_sum, VALOP_INT_MIN &_cuda_min, @@ -368,7 +370,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), 
RAJA::Threads(HIP_BLOCK_SIZE)), - "HipReductionKernel", RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), RAJA::expt::Reduce(&hip_max), @@ -376,6 +377,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&hip_maxloc), RAJA::expt::ReduceLoc(&hip_min2, &hip_minloc2), RAJA::expt::ReduceLoc(&hip_max2, &hip_maxloc2), + RAJA::expt::KernelName( "HipReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_hip_sum, VALOP_INT_MIN &_hip_min, @@ -442,7 +444,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), - "SyclReductionKernel", RAJA::expt::Reduce(&sycl_sum), RAJA::expt::Reduce(&sycl_min), RAJA::expt::Reduce(&sycl_max), @@ -450,7 +451,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&sycl_maxloc), RAJA::expt::ReduceLoc(&sycl_min2, &sycl_minloc2), RAJA::expt::ReduceLoc(&sycl_max2, &sycl_maxloc2), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, + RAJA::expt::KernelName( "SyclReductionKernel"), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_sycl_sum, VALOP_INT_MIN &_sycl_min, VALOP_INT_MAX &_sycl_max, diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index 24e313e649..13c90b28da 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -153,7 +153,7 @@ int main(int argc, char *argv[]) (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), - "Launch Reductions", + RAJA::expt::KernelName("Launch Reductions"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt index a01de12e22..d4f0da8c5d 100644 --- a/examples/plugin/CMakeLists.txt +++ b/examples/plugin/CMakeLists.txt @@ -5,6 +5,15 @@ # SPDX-License-Identifier: (BSD-3-Clause) ################################################################################ +if(RAJA_ENABLE_CALIPER) + raja_add_executable( + NAME raja-forall-caliper + SOURCES raja-forall-caliper.cpp caliper-plugin.cpp) + raja_add_executable( + NAME raja-launch-caliper + SOURCES raja-launch-caliper.cpp caliper-plugin.cpp) +endif() + raja_add_executable( NAME plugin-example SOURCES test-plugin.cpp counter-plugin.cpp) @@ -13,7 +22,7 @@ if (RAJA_ENABLE_RUNTIME_PLUGINS) raja_add_executable( NAME plugin-example-dynamic SOURCES test-plugin-dynamic.cpp) - + raja_add_plugin_library(NAME timer_plugin SHARED TRUE SOURCES timer-plugin.cpp) diff --git a/examples/plugin/caliper-plugin.cpp b/examples/plugin/caliper-plugin.cpp new file mode 100644 index 0000000000..246d98307d --- /dev/null +++ b/examples/plugin/caliper-plugin.cpp @@ -0,0 +1,37 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/util/PluginStrategy.hpp" + +#include +#include + +class CaliperPlugin : public RAJA::util::PluginStrategy +{ +public: + void preLaunch(const RAJA::util::PluginContext&p) override + { + if(!p.kernel_name->empty()) CALI_MARK_BEGIN(p.kernel_name->c_str()); + } + + void postLaunch(const RAJA::util::PluginContext& p) override + { + if(!p.kernel_name->empty()) CALI_MARK_END(p.kernel_name->c_str()); + } + +private: + +}; + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin() +{ + return new CaliperPlugin; +} + +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Caliper", "Enables Caliper Profiling"); diff --git a/examples/plugin/raja-forall-caliper.cpp b/examples/plugin/raja-forall-caliper.cpp new file mode 100644 index 0000000000..9c0b9a01b9 --- /dev/null +++ b/examples/plugin/raja-forall-caliper.cpp @@ -0,0 +1,295 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * Daxpy Example + * + * Computes a += b*c, where a, b are vectors of doubles + * and c is a scalar double. It illustrates similarities between a + * C-style for-loop and a RAJA forall loop. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + */ + +// +// Functions for checking and printing results +// +void checkResult(double* v1, double* v2, int len); +void printResult(double* v, int len); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA daxpy example...\n"; + +// + auto timer = RAJA::Timer(); + + +// +// Define vector length +// + const int N = 1000000; + +// +// Allocate and initialize vector data. +// + double* a0 = new double[N]; + double* aref = new double[N]; + + double* ta = new double[N]; + double* tb = new double[N]; + + double c = 3.14159; + + for (int i = 0; i < N; i++) { + a0[i] = 1.0; + tb[i] = 2.0; + } + +// +// Declare and set pointers to array data. +// We reset them for each daxpy version so that +// they all look the same. +// + + double* a = ta; + double* b = tb; + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version of daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.start(); + CALI_CXX_MARK_SCOPE("C-version elapsed time"); + for (int i = 0; i < N; ++i) { + a[i] += b[i] * c; + } + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + + std::memcpy( aref, a, N* sizeof(double) ); + +//----------------------------------------------------------------------------// + +// +// In the following, we show a RAJA version +// of the daxpy operation and how it can +// be run differently by choosing different +// RAJA execution policies. +// +// Note that the only thing that changes in +// these versions is the execution policy. 
+// To implement these cases using the +// programming model choices directly, would +// require unique changes for each. +// + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall(RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA Seq daxpy Kernel"), [=] (int i) { + + a[i] += b[i] * c; + + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-Seq elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); + + +//----------------------------------------------------------------------------// + +// +// RAJA SIMD version. +// + std::cout << "\n Running RAJA SIMD daxpy...\n"; + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA SIMD daxpy Kernel"), + [=] (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-SIMD elapsed time : " << etime << " seconds" << std::endl; + checkResult(a, aref, N); + } +//printResult(a, N); + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA OpenMP daxpy Kernel"), + [=] (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-OMP elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) +// +// RAJA CUDA parallel GPU version (256 threads per thread block). +// + std::cout << "\n Running RAJA CUDA daxpy...\n"; + + a = 0; b = 0; + cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); + cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); + + cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + + { + timer.reset(); + timer.start(); + RAJA::forall> + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA CUDA daxpy Kernel"), + [=] RAJA_DEVICE (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-CUDA elapsed time : " << etime << " seconds" << std::endl; + } + + cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + + cudaErrchk(cudaFree(a)); + cudaErrchk(cudaFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) +// +// RAJA HIP parallel GPU version (256 threads per thread block). 
+// + std::cout << "\n Running RAJA HIP daxpy...\n"; + + a = 0; b = 0; + hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); + hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + + hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + + { + timer.reset(); + timer.start(); + RAJA::forall> + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA HIP daxpy Kernel"), + [=] RAJA_DEVICE (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-HIP elapsed time : " << etime << " seconds" << std::endl; + } + + hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + + hipErrchk(hipFree(a)); + hipErrchk(hipFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + delete[] a0; + delete[] aref; + delete[] ta; + delete[] tb; + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and report P/F. +// +void checkResult(double* v1, double* v2, int len) +{ + bool match = true; + for (int i = 0; i < len; i++) { + if ( v1[i] != v2[i] ) { match = false; } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(double* v, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << v[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/plugin/raja-launch-caliper.cpp b/examples/plugin/raja-launch-caliper.cpp new file mode 100644 index 0000000000..46cc6f1f2c --- /dev/null +++ b/examples/plugin/raja-launch-caliper.cpp @@ -0,0 +1,241 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * RAJA Caliper integration with launch + */ + +// +// Functions for checking and printing results +// +void checkResult(double* v1, double* v2, int len); +void printResult(double* v, int len); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA daxpy example...\n"; + +// + auto timer = RAJA::Timer(); + + +// +// Define vector length +// + const int N = 1000000; + +// +// Define compute grid +// + const int Nthreads = 256; + const int Nteams = (N-1)/Nthreads + 1; + +// +// Allocate and initialize vector data. +// + double* a0 = new double[N]; + double* aref = new double[N]; + + double* ta = new double[N]; + double* tb = new double[N]; + + double c = 3.14159; + + for (int i = 0; i < N; i++) { + a0[i] = 1.0; + tb[i] = 2.0; + } + +// +// Declare and set pointers to array data. +// We reset them for each daxpy version so that +// they all look the same. 
+// + + double* a = ta; + double* b = tb; + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version of daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + CALI_CXX_MARK_SCOPE("CALI: C-version elapsed time"); + for (int i = 0; i < N; ++i) { + a[i] += b[i] * c; + } + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + + std::memcpy( aref, a, N* sizeof(double) ); + +//----------------------------------------------------------------------------// + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running launch sequential daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + using seq_launch_policy = RAJA::LaunchPolicy; + using seq_loop_policy = RAJA::LoopPolicy; + + timer.reset(); + timer.start(); + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) +// +// RAJA CUDA parallel GPU version (256 threads per thread block). +// + std::cout << "\n Running RAJA CUDA daxpy...\n"; + using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_loop_policy = RAJA::LoopPolicy; + + a = 0; b = 0; + cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); + cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); + + cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch Cuda kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + + cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + + cudaErrchk(cudaFree(a)); + cudaErrchk(cudaFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) +// +// RAJA HIP parallel GPU version (256 threads per thread block). 
+// + std::cout << "\n Running RAJA HIP daxpy...\n"; + using hip_launch_policy = RAJA::LaunchPolicy>; + using hip_loop_policy = RAJA::LoopPolicy; + + a = 0; b = 0; + hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); + hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + + hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch Cuda kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + + hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + + hipErrchk(hipFree(a)); + hipErrchk(hipFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + delete[] a0; + delete[] aref; + delete[] ta; + delete[] tb; + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and report P/F. +// +void checkResult(double* v1, double* v2, int len) +{ + bool match = true; + for (int i = 0; i < len; i++) { + if ( v1[i] != v2[i] ) { match = false; } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(double* v, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << v[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 29d97fed69..b756f3e297 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -179,6 +179,7 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_OMP_TASK #cmakedefine RAJA_ENABLE_VECTORIZATION +#cmakedefine RAJA_ENABLE_CALIPER #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index e75cc43af7..45d669114a 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -509,6 +509,25 @@ forall_Icount(ExecutionPolicy&& p, ****************************************************************************** */ +template +struct get_kernel_name +{ + template + static std::string get(U &) + { + return std::string(); //return empty string + } +}; + +template<> +struct get_kernel_name +{ + static std::string get(const RAJA::expt::detail::KernelName &kernel_name) + { + return kernel_name.name; + } +}; + template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, @@ -521,10 +540,15 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... 
params) "Container does not model RandomAccessIterator"); auto f_params = expt::make_forall_param_pack(std::forward(params)...); + + auto&& kernel_name = expt::get_kernel_name(std::forward(params)...); auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + std::string kname = get_kernel_name::get(kernel_name); + + util::PluginContext context{util::make_context>(&kname)}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 7ea7ce57ef..5961cb9fc9 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -218,53 +218,22 @@ class LaunchContext template struct LaunchExecute; -//Policy based launch with support to new reducers... -template -void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) -{ - - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); - - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); - - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; - util::callPreCapturePlugins(context); - - using RAJA::util::trigger_updates_before; - auto p_body = trigger_updates_before(launch_body); - - util::callPostCapturePlugins(context); - - util::callPreLaunchPlugins(context); - - using launch_t = LaunchExecute; - - using Res = typename resources::get_resource::type; - - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); - - util::callPostLaunchPlugins(context); -} - - //Duplicate of code above on account that we need to support the case in which a kernel_name is not given template void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) { - - const char *kernel_name = nullptr; - //Get reducers auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + //get kernel name + auto&& kernel_name = expt::get_kernel_name(std::forward(rest_of_launch_args)...); + std::string kname = get_kernel_name::get(kernel_name); + auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); //Take the first policy as we assume the second policy is not user defined. //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + util::PluginContext context{util::make_context(&kname)}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -278,7 +247,7 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... 
rest_of_launch_ using Res = typename resources::get_resource::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec(Res::get_default(), launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); } @@ -292,78 +261,25 @@ void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) launch(place, params, nullptr, body); } -template -void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) -{ - - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } -#endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - -} - -//Run-time API for new reducer interface -template -void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) -{ - - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } -#endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - -} - //Run-time API for new reducer interface with support of the case without a new kernel name template void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args) //BODY const &body) { - const char *kernel_name = nullptr; - //Forward to single policy launch API - simplifies testing of plugins switch (place) { case ExecPlace::HOST: { using Res = typename resources::get_resource::type; launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); + (Res::get_default(), launch_params, std::forward(rest_of_launch_args)...); break; } #if defined(RAJA_GPU_ACTIVE) case ExecPlace::DEVICE: { using Res = typename resources::get_resource::type; launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); + (Res::get_default(), launch_params, std::forward(rest_of_launch_args)...); break; } #endif @@ -391,70 +307,6 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ } //Launch API which takes team resource struct and supports new reducers -template -resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, - const char *kernel_name, ReduceParams&&... 
rest_of_launch_args) -{ - - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); - - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); - - ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { - place = RAJA::ExecPlace::HOST; - } else { - place = RAJA::ExecPlace::DEVICE; - } - - // - //Configure plugins - // -#if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; -#else - util::PluginContext context{util::make_context()}; -#endif - - util::callPreCapturePlugins(context); - - using RAJA::util::trigger_updates_before; - auto p_body = trigger_updates_before(launch_body); - - util::callPostCapturePlugins(context); - - util::callPreLaunchPlugins(context); - - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } -#endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - } - - RAJA_ABORT_OR_THROW("Unknown launch place"); - - //^^ RAJA will abort before getting here - return resources::EventProxy(res); -} - //Duplicate of API above on account that we need to handle the case that a kernel name is not provided template @@ -463,11 +315,12 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; - //Get reducers auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + auto&& kernel_name = expt::get_kernel_name(std::forward(rest_of_launch_args)...); + std::string kname = get_kernel_name::get(kernel_name); + auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; @@ -482,10 +335,10 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, // #if defined(RAJA_GPU_ACTIVE) util::PluginContext context{place == ExecPlace::HOST ? 
- util::make_context() : - util::make_context()}; + util::make_context(&kname) : + util::make_context(&kname)}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{util::make_context(&kname)}; #endif util::callPreCapturePlugins(context); @@ -500,14 +353,14 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, switch (place) { case ExecPlace::HOST: { using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + resources::EventProxy e_proxy = launch_t::exec(res, launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); return e_proxy; } #if defined(RAJA_GPU_ACTIVE) case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + resources::EventProxy e_proxy = launch_t::exec(res, launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); return e_proxy; } diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 5a656206f5..623d6f48b1 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -36,7 +36,7 @@ namespace expt using Base = camp::tuple; Base param_tup; - static constexpr size_t param_tup_sz = camp::tuple_size::value; + static constexpr size_t param_tup_sz = camp::tuple_size::value; using params_seq = camp::make_idx_seq_t< param_tup_sz >; private: @@ -59,7 +59,7 @@ namespace expt static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); } - + // Resolve template static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { @@ -75,7 +75,7 @@ namespace expt static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - + //Use the size of param_tup to generate the argument list. RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } @@ -93,8 +93,8 @@ namespace expt template ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; - }; // struct ForallParamPack - + }; // struct ForallParamPack + //=========================================================================== @@ -151,7 +151,7 @@ namespace expt return ForallParamPack...>(std::move(tuple)); } - + namespace detail { // Maybe we should do a lot of these with structs... @@ -171,7 +171,7 @@ namespace expt template constexpr auto make_forall_param_pack(Args&&... args){ // We assume the last element of the pack is the lambda so we need to strip it from the list. - auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); + auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); } //=========================================================================== @@ -187,7 +187,25 @@ namespace expt template constexpr auto&& get_lambda(Args&&... args){ return camp::get( camp::forward_as_tuple(std::forward(args)...) 
); - } + } + //=========================================================================== + + //=========================================================================== + // + // + // kernel_name is expected to be the second to last argument, just extract it + // + // + template= 2, bool>::type = true> + constexpr auto&& get_kernel_name(Args&&... args){ + return camp::get( camp::forward_as_tuple(std::forward(args)...) ); + } + + + template::type = true> + constexpr auto&& get_kernel_name(Args&&... args){ + return camp::get<0>( camp::forward_as_tuple(std::forward(args)...) ); + } //=========================================================================== @@ -200,10 +218,10 @@ namespace expt // namespace detail { - // + // // // Lambda traits Utilities - // + // // template struct lambda_traits; @@ -211,28 +229,28 @@ namespace expt template struct lambda_traits { // non-const specialization - using arg_type = First; + using arg_type = First; }; template struct lambda_traits { // const specialization - using arg_type = First; + using arg_type = First; }; template typename lambda_traits::arg_type* lambda_arg_helper(T); - // + // // // List manipulation Utilities - // + // // template constexpr auto list_remove_pointer(const camp::list&){ return camp::list::type>...>{}; } - + template constexpr auto list_add_lvalue_ref(const camp::list&){ return camp::list::type...>{}; @@ -276,7 +294,7 @@ namespace expt template constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { #if !defined(RAJA_ENABLE_HIP) - static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); + static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. 
Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); #endif } @@ -284,7 +302,7 @@ namespace expt template - constexpr + constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp) { @@ -299,7 +317,7 @@ namespace expt detail::check_invocable(std::forward(l), expected_arg_type_list{}); } //=========================================================================== - + //=========================================================================== diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp index 4edf645ed3..a3edc3296a 100644 --- a/include/RAJA/policy/cuda/params/kernel_name.hpp +++ b/include/RAJA/policy/cuda/params/kernel_name.hpp @@ -16,7 +16,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &) { -#if defined(RAJA_ENABLE_NV_TOOLS_EXT) +#if defined(RAJA_ENABLE_NV_TOOLS_EXT) && !defined(RAJA_ENABLE_CALIPER) nvtxRangePush(kn.name); #else RAJA_UNUSED_VAR(kn); @@ -34,7 +34,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &) { -#if defined(RAJA_ENABLE_NV_TOOLS_EXT) +#if defined(RAJA_ENABLE_NV_TOOLS_EXT) && !defined(RAJA_ENABLE_CALIPER) nvtxRangePop(); #endif } diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp index 30269f8406..8a001c612a 100644 --- a/include/RAJA/policy/hip/params/kernel_name.hpp +++ b/include/RAJA/policy/hip/params/kernel_name.hpp @@ -20,7 +20,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(KernelName& kn, const RAJA::hip::detail::hipInfo &) { -#if defined(RAJA_ENABLE_ROCTX) +#if defined(RAJA_ENABLE_ROCTX) && !defined(RAJA_ENABLE_CALIPER) roctxRangePush(kn.name); #else RAJA_UNUSED_VAR(kn); @@ -38,7 +38,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > resolve(KernelName&, const RAJA::hip::detail::hipInfo &) { -#if defined(RAJA_ENABLE_ROCTX) +#if defined(RAJA_ENABLE_ROCTX) && !defined(RAJA_ENABLE_CALIPER) roctxRangePop(); #endif } diff --git a/include/RAJA/util/PluginContext.hpp b/include/RAJA/util/PluginContext.hpp index 996836e397..2bc7dc90f4 100644 --- a/include/RAJA/util/PluginContext.hpp +++ b/include/RAJA/util/PluginContext.hpp @@ -8,6 +8,8 @@ #ifndef RAJA_plugin_context_HPP #define RAJA_plugin_context_HPP +#include + #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/internal/get_platform.hpp" @@ -18,10 +20,11 @@ class KokkosPluginLoader; struct PluginContext { public: - PluginContext(const Platform p) : - platform(p) {} + PluginContext(const Platform p, const std::string *name = nullptr) : + platform(p), kernel_name(name) {} - Platform platform; + Platform platform; + const std::string *kernel_name; private: mutable uint64_t kID; @@ -30,9 +33,9 @@ struct PluginContext { }; template -PluginContext make_context() +PluginContext make_context(const std::string *name=nullptr) { - return PluginContext{detail::get_platform::value}; + return PluginContext{detail::get_platform::value, name}; } } // closing brace for util namespace diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp index 91ab75dbab..f4f377ae43 100644 --- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp +++ 
b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp @@ -59,9 +59,9 @@ void LaunchParamExptReduceMinBasicTestImpl(const SEG_TYPE& seg, RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), - "LaunchMinBasicTest", RAJA::expt::Reduce(&mininit), RAJA::expt::Reduce(&min), + RAJA::expt::KernelName("LaunchMinBasicTest"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN &_mininit, REF_MIN &_min) { RAJA::loop(ctx, seg, [&](IDX_TYPE idx) { diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp index f6200628cf..f15bdc53cd 100644 --- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp +++ b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp @@ -57,9 +57,9 @@ void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg, RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), - "LaunchSumBasicTest", RAJA::expt::Reduce(&sum), RAJA::expt::Reduce(&sum2), + RAJA::expt::KernelName("LaunchSumBasicTest"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM &_sum, REF_SUM &_sum2) { RAJA::loop(ctx, seg, [&](IDX_TYPE idx) {
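
Usage note: with this change, kernel names are passed to RAJA::forall and RAJA::launch as a RAJA::expt::KernelName(...) parameter placed before the lambda, replacing the old positional const char* argument; when RAJA is configured with RAJA_ENABLE_CALIPER, the Caliper plugin added above brackets each named kernel with CALI_MARK_BEGIN/CALI_MARK_END in preLaunch/postLaunch. The sketch below is illustrative only: the execution and launch policies shown are ordinary RAJA policies chosen for the example and are not part of this diff.

#include "RAJA/RAJA.hpp"

// Minimal sketch of the kernel-naming API introduced in this change.
// Assumes a build with -DRAJA_ENABLE_CALIPER=On and the Caliper plugin
// from examples/plugin/caliper-plugin.cpp registered with RAJA.
void daxpy_named(double* a, const double* b, double c, int N)
{
  // forall: KernelName is an optional parameter preceding the lambda.
  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::KernelName("daxpy forall"),
      [=](int i) { a[i] += b[i] * c; });

  // launch: KernelName replaces the former const char* kernel_name argument.
  RAJA::launch<RAJA::LaunchPolicy<RAJA::seq_launch_t>>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      RAJA::expt::KernelName("daxpy launch"),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<RAJA::LoopPolicy<RAJA::seq_exec>>(
            ctx, RAJA::RangeSegment(0, N),
            [&](int i) { a[i] += b[i] * c; });
      });
}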