diff --git a/CMakeLists.txt b/CMakeLists.txt index 3eb0dbc8d2..beb2abf449 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,8 +154,20 @@ endif () set (raja_depends) +if (RAJA_ENABLE_CALIPER) +set (raja_depends + ${raja_depends} + caliper) + find_package(caliper REQUIRED + NO_DEFAULT_PATH + PATHS ${caliper_DIR} + ) +message(STATUS "Using Caliper from location = ${caliper_DIR}") +endif() + if (RAJA_ENABLE_OPENMP) set (raja_depends + ${raja_depends} openmp) endif() diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp index feb5247224..2002f39b6f 100644 --- a/examples/dynamic_mat_transpose.cpp +++ b/examples/dynamic_mat_transpose.cpp @@ -325,7 +325,7 @@ int main(int argc, char *argv[]) RAJA::launch (res, RAJA::LaunchParams(RAJA::Teams(outer_Dimc, outer_Dimr), RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), - "Matrix tranpose with dynamic shared memory kernel", + RAJA::expt::KernelName("Matrix tranpose with dynamic shared memory kernel"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ diff --git a/examples/launch-param-reductions.cpp b/examples/launch-param-reductions.cpp index 5bec907c33..353a9b61e0 100644 --- a/examples/launch-param-reductions.cpp +++ b/examples/launch-param-reductions.cpp @@ -151,7 +151,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type seq_maxloc2(-1); RAJA::launch - (host_res, RAJA::LaunchParams(), "SeqReductionKernel", + (host_res, RAJA::LaunchParams(), RAJA::expt::Reduce(&seq_sum), RAJA::expt::Reduce(&seq_min), RAJA::expt::Reduce(&seq_max), @@ -159,6 +159,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&seq_maxloc), RAJA::expt::ReduceLoc(&seq_min2, &seq_minloc2), RAJA::expt::ReduceLoc(&seq_max2, &seq_maxloc2), + RAJA::expt::KernelName("SeqReductionKernel"), [=] RAJA_HOST_DEVICE ( RAJA::LaunchContext ctx, VALOP_INT_SUM &_seq_sum, VALOP_INT_MIN &_seq_min, @@ -218,7 +219,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::Index_type omp_maxloc2(-1); RAJA::launch - (host_res, RAJA::LaunchParams(), "OmpReductionKernel", + (host_res, RAJA::LaunchParams(), RAJA::expt::Reduce(&omp_sum), RAJA::expt::Reduce(&omp_min), RAJA::expt::Reduce(&omp_max), @@ -226,6 +227,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&omp_maxloc), RAJA::expt::ReduceLoc(&omp_min2, &omp_minloc2), RAJA::expt::ReduceLoc(&omp_max2, &omp_maxloc2), + RAJA::expt::KernelName("OmpReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_omp_sum, VALOP_INT_MIN &_omp_min, @@ -291,7 +293,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(CUDA_BLOCK_SIZE)), - "CUDAReductionKernel", RAJA::expt::Reduce(&cuda_sum), RAJA::expt::Reduce(&cuda_min), RAJA::expt::Reduce(&cuda_max), @@ -299,6 +300,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&cuda_maxloc), RAJA::expt::ReduceLoc(&cuda_min2, &cuda_minloc2), RAJA::expt::ReduceLoc(&cuda_max2, &cuda_maxloc2), + RAJA::expt::KernelName( "CUDAReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_cuda_sum, VALOP_INT_MIN &_cuda_min, @@ -368,7 +370,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), 
RAJA::Threads(HIP_BLOCK_SIZE)), - "HipReductionKernel", RAJA::expt::Reduce(&hip_sum), RAJA::expt::Reduce(&hip_min), RAJA::expt::Reduce(&hip_max), @@ -376,6 +377,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&hip_maxloc), RAJA::expt::ReduceLoc(&hip_min2, &hip_minloc2), RAJA::expt::ReduceLoc(&hip_max2, &hip_maxloc2), + RAJA::expt::KernelName( "HipReductionKernel"), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_hip_sum, VALOP_INT_MIN &_hip_min, @@ -442,7 +444,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::launch (device_res, RAJA::LaunchParams(RAJA::Teams(NUMBER_OF_TEAMS), RAJA::Threads(SYCL_BLOCK_SIZE)), - "SyclReductionKernel", RAJA::expt::Reduce(&sycl_sum), RAJA::expt::Reduce(&sycl_min), RAJA::expt::Reduce(&sycl_max), @@ -450,7 +451,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::expt::Reduce(&sycl_maxloc), RAJA::expt::ReduceLoc(&sycl_min2, &sycl_minloc2), RAJA::expt::ReduceLoc(&sycl_max2, &sycl_maxloc2), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, + RAJA::expt::KernelName( "SyclReductionKernel"), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx, VALOP_INT_SUM &_sycl_sum, VALOP_INT_MIN &_sycl_min, VALOP_INT_MAX &_sycl_max, diff --git a/examples/launch_reductions.cpp b/examples/launch_reductions.cpp index 24e313e649..13c90b28da 100644 --- a/examples/launch_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -153,7 +153,7 @@ int main(int argc, char *argv[]) (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), RAJA::Threads(TEAM_SZ)), - "Launch Reductions", + RAJA::expt::KernelName("Launch Reductions"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt index a01de12e22..d4f0da8c5d 100644 --- a/examples/plugin/CMakeLists.txt +++ b/examples/plugin/CMakeLists.txt @@ -5,6 +5,15 @@ # SPDX-License-Identifier: (BSD-3-Clause) ################################################################################ +if(RAJA_ENABLE_CALIPER) + raja_add_executable( + NAME raja-forall-caliper + SOURCES raja-forall-caliper.cpp caliper-plugin.cpp) + raja_add_executable( + NAME raja-launch-caliper + SOURCES raja-launch-caliper.cpp caliper-plugin.cpp) +endif() + raja_add_executable( NAME plugin-example SOURCES test-plugin.cpp counter-plugin.cpp) @@ -13,7 +22,7 @@ if (RAJA_ENABLE_RUNTIME_PLUGINS) raja_add_executable( NAME plugin-example-dynamic SOURCES test-plugin-dynamic.cpp) - + raja_add_plugin_library(NAME timer_plugin SHARED TRUE SOURCES timer-plugin.cpp) diff --git a/examples/plugin/caliper-plugin.cpp b/examples/plugin/caliper-plugin.cpp new file mode 100644 index 0000000000..246d98307d --- /dev/null +++ b/examples/plugin/caliper-plugin.cpp @@ -0,0 +1,37 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "RAJA/util/PluginStrategy.hpp" + +#include +#include + +class CaliperPlugin : public RAJA::util::PluginStrategy +{ +public: + void preLaunch(const RAJA::util::PluginContext&p) override + { + if(!p.kernel_name->empty()) CALI_MARK_BEGIN(p.kernel_name->c_str()); + } + + void postLaunch(const RAJA::util::PluginContext& p) override + { + if(!p.kernel_name->empty()) CALI_MARK_END(p.kernel_name->c_str()); + } + +private: + +}; + +// Dynamically loading plugin. +extern "C" RAJA::util::PluginStrategy *getPlugin() +{ + return new CaliperPlugin; +} + +// Statically loading plugin. +static RAJA::util::PluginRegistry::add P("Caliper", "Enables Caliper Profiling"); diff --git a/examples/plugin/raja-forall-caliper.cpp b/examples/plugin/raja-forall-caliper.cpp new file mode 100644 index 0000000000..9c0b9a01b9 --- /dev/null +++ b/examples/plugin/raja-forall-caliper.cpp @@ -0,0 +1,295 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * Daxpy Example + * + * Computes a += b*c, where a, b are vectors of doubles + * and c is a scalar double. It illustrates similarities between a + * C-style for-loop and a RAJA forall loop. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + */ + +// +// Functions for checking and printing results +// +void checkResult(double* v1, double* v2, int len); +void printResult(double* v, int len); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA daxpy example...\n"; + +// + auto timer = RAJA::Timer(); + + +// +// Define vector length +// + const int N = 1000000; + +// +// Allocate and initialize vector data. +// + double* a0 = new double[N]; + double* aref = new double[N]; + + double* ta = new double[N]; + double* tb = new double[N]; + + double c = 3.14159; + + for (int i = 0; i < N; i++) { + a0[i] = 1.0; + tb[i] = 2.0; + } + +// +// Declare and set pointers to array data. +// We reset them for each daxpy version so that +// they all look the same. +// + + double* a = ta; + double* b = tb; + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version of daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.start(); + CALI_CXX_MARK_SCOPE("C-version elapsed time"); + for (int i = 0; i < N; ++i) { + a[i] += b[i] * c; + } + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + + std::memcpy( aref, a, N* sizeof(double) ); + +//----------------------------------------------------------------------------// + +// +// In the following, we show a RAJA version +// of the daxpy operation and how it can +// be run differently by choosing different +// RAJA execution policies. +// +// Note that the only thing that changes in +// these versions is the execution policy. 
+// To implement these cases using the +// programming model choices directly, would +// require unique changes for each. +// + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall(RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA Seq daxpy Kernel"), [=] (int i) { + + a[i] += b[i] * c; + + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-Seq elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); + + +//----------------------------------------------------------------------------// + +// +// RAJA SIMD version. +// + std::cout << "\n Running RAJA SIMD daxpy...\n"; + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA SIMD daxpy Kernel"), + [=] (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-SIMD elapsed time : " << etime << " seconds" << std::endl; + checkResult(a, aref, N); + } +//printResult(a, N); + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + RAJA::forall + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA OpenMP daxpy Kernel"), + [=] (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-OMP elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) +// +// RAJA CUDA parallel GPU version (256 threads per thread block). +// + std::cout << "\n Running RAJA CUDA daxpy...\n"; + + a = 0; b = 0; + cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); + cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); + + cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + + { + timer.reset(); + timer.start(); + RAJA::forall> + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA CUDA daxpy Kernel"), + [=] RAJA_DEVICE (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-CUDA elapsed time : " << etime << " seconds" << std::endl; + } + + cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + + cudaErrchk(cudaFree(a)); + cudaErrchk(cudaFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) +// +// RAJA HIP parallel GPU version (256 threads per thread block). 
+// + std::cout << "\n Running RAJA HIP daxpy...\n"; + + a = 0; b = 0; + hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); + hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + + hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + + { + timer.reset(); + timer.start(); + RAJA::forall> + (RAJA::RangeSegment(0, N), + RAJA::expt::KernelName("RAJA HIP daxpy Kernel"), + [=] RAJA_DEVICE (int i) { + a[i] += b[i] * c; + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "RAJA-HIP elapsed time : " << etime << " seconds" << std::endl; + } + + hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + + hipErrchk(hipFree(a)); + hipErrchk(hipFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + delete[] a0; + delete[] aref; + delete[] ta; + delete[] tb; + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and report P/F. +// +void checkResult(double* v1, double* v2, int len) +{ + bool match = true; + for (int i = 0; i < len; i++) { + if ( v1[i] != v2[i] ) { match = false; } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(double* v, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << v[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/plugin/raja-launch-caliper.cpp b/examples/plugin/raja-launch-caliper.cpp new file mode 100644 index 0000000000..46cc6f1f2c --- /dev/null +++ b/examples/plugin/raja-launch-caliper.cpp @@ -0,0 +1,241 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +/* + * RAJA Caliper integration with launch + */ + +// +// Functions for checking and printing results +// +void checkResult(double* v1, double* v2, int len); +void printResult(double* v, int len); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + std::cout << "\n\nRAJA daxpy example...\n"; + +// + auto timer = RAJA::Timer(); + + +// +// Define vector length +// + const int N = 1000000; + +// +// Define compute grid +// + const int Nthreads = 256; + const int Nteams = (N-1)/Nthreads + 1; + +// +// Allocate and initialize vector data. +// + double* a0 = new double[N]; + double* aref = new double[N]; + + double* ta = new double[N]; + double* tb = new double[N]; + + double c = 3.14159; + + for (int i = 0; i < N; i++) { + a0[i] = 1.0; + tb[i] = 2.0; + } + +// +// Declare and set pointers to array data. +// We reset them for each daxpy version so that +// they all look the same. 
+// + + double* a = ta; + double* b = tb; + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version of daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + timer.reset(); + timer.start(); + CALI_CXX_MARK_SCOPE("CALI: C-version elapsed time"); + for (int i = 0; i < N; ++i) { + a[i] += b[i] * c; + } + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + + std::memcpy( aref, a, N* sizeof(double) ); + +//----------------------------------------------------------------------------// + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running launch sequential daxpy...\n"; + + std::memcpy( a, a0, N * sizeof(double) ); + { + using seq_launch_policy = RAJA::LaunchPolicy; + using seq_loop_policy = RAJA::LoopPolicy; + + timer.reset(); + timer.start(); + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + timer.stop(); + RAJA::Timer::ElapsedType etime = timer.elapsed(); + std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl; + } + checkResult(a, aref, N); +//printResult(a, N); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) +// +// RAJA CUDA parallel GPU version (256 threads per thread block). +// + std::cout << "\n Running RAJA CUDA daxpy...\n"; + using cuda_launch_policy = RAJA::LaunchPolicy>; + using cuda_loop_policy = RAJA::LoopPolicy; + + a = 0; b = 0; + cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) )); + cudaErrchk(cudaMalloc( (void**)&b, N * sizeof(double) )); + + cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice )); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch Cuda kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + + cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost )); + + cudaErrchk(cudaFree(a)); + cudaErrchk(cudaFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) +// +// RAJA HIP parallel GPU version (256 threads per thread block). 
+// + std::cout << "\n Running RAJA HIP daxpy...\n"; + using hip_launch_policy = RAJA::LaunchPolicy>; + using hip_loop_policy = RAJA::LoopPolicy; + + a = 0; b = 0; + hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) )); + hipErrchk(hipMalloc( (void**)&b, N * sizeof(double) )); + + hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice )); + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)), + RAJA::expt::KernelName("CALI: launch Cuda kernel"), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) + { + a[i] += b[i] * c; + }); + + }); + + hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost )); + + hipErrchk(hipFree(a)); + hipErrchk(hipFree(b)); + + a = ta; + checkResult(a, aref, N); +//printResult(a, N); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + delete[] a0; + delete[] aref; + delete[] ta; + delete[] tb; + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and report P/F. +// +void checkResult(double* v1, double* v2, int len) +{ + bool match = true; + for (int i = 0; i < len; i++) { + if ( v1[i] != v2[i] ) { match = false; } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(double* v, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << v[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 29d97fed69..b756f3e297 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -179,6 +179,7 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_OMP_TASK #cmakedefine RAJA_ENABLE_VECTORIZATION +#cmakedefine RAJA_ENABLE_CALIPER #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index e75cc43af7..45d669114a 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -509,6 +509,25 @@ forall_Icount(ExecutionPolicy&& p, ****************************************************************************** */ +template +struct get_kernel_name +{ + template + static std::string get(U &) + { + return std::string(); //return empty string + } +}; + +template<> +struct get_kernel_name +{ + static std::string get(const RAJA::expt::detail::KernelName &kernel_name) + { + return kernel_name.name; + } +}; + template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, @@ -521,10 +540,15 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... 
params) "Container does not model RandomAccessIterator"); auto f_params = expt::make_forall_param_pack(std::forward(params)...); + + auto&& kernel_name = expt::get_kernel_name(std::forward(params)...); auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); - util::PluginContext context{util::make_context>()}; + std::string kname = get_kernel_name::get(kernel_name); + + util::PluginContext context{util::make_context>(&kname)}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 7ea7ce57ef..5961cb9fc9 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -218,53 +218,22 @@ class LaunchContext template struct LaunchExecute; -//Policy based launch with support to new reducers... -template -void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) -{ - - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); - - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); - - //Take the first policy as we assume the second policy is not user defined. - //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; - util::callPreCapturePlugins(context); - - using RAJA::util::trigger_updates_before; - auto p_body = trigger_updates_before(launch_body); - - util::callPostCapturePlugins(context); - - util::callPreLaunchPlugins(context); - - using launch_t = LaunchExecute; - - using Res = typename resources::get_resource::type; - - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); - - util::callPostLaunchPlugins(context); -} - - //Duplicate of code above on account that we need to support the case in which a kernel_name is not given template void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) { - - const char *kernel_name = nullptr; - //Get reducers auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + //get kernel name + auto&& kernel_name = expt::get_kernel_name(std::forward(rest_of_launch_args)...); + std::string kname = get_kernel_name::get(kernel_name); + auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); //Take the first policy as we assume the second policy is not user defined. //We rely on the user to pair launch and loop policies correctly. - util::PluginContext context{util::make_context()}; + util::PluginContext context{util::make_context(&kname)}; util::callPreCapturePlugins(context); using RAJA::util::trigger_updates_before; @@ -278,7 +247,7 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... 
rest_of_launch_ using Res = typename resources::get_resource::type; - launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers); + launch_t::exec(Res::get_default(), launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); } @@ -292,78 +261,25 @@ void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) launch(place, params, nullptr, body); } -template -void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) -{ - - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch>(Res::get_default(), params, kernel_name, body); - break; - } -#endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - -} - -//Run-time API for new reducer interface -template -void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args) -{ - - //Forward to single policy launch API - simplifies testing of plugins - switch (place) { - case ExecPlace::HOST: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using Res = typename resources::get_resource::type; - launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); - break; - } -#endif - default: - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - -} - //Run-time API for new reducer interface with support of the case without a new kernel name template void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args) //BODY const &body) { - const char *kernel_name = nullptr; - //Forward to single policy launch API - simplifies testing of plugins switch (place) { case ExecPlace::HOST: { using Res = typename resources::get_resource::type; launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); + (Res::get_default(), launch_params, std::forward(rest_of_launch_args)...); break; } #if defined(RAJA_GPU_ACTIVE) case ExecPlace::DEVICE: { using Res = typename resources::get_resource::type; launch> - (Res::get_default(), launch_params, kernel_name, std::forward(rest_of_launch_args)...); + (Res::get_default(), launch_params, std::forward(rest_of_launch_args)...); break; } #endif @@ -391,70 +307,6 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){ } //Launch API which takes team resource struct and supports new reducers -template -resources::EventProxy -launch(RAJA::resources::Resource res, LaunchParams const &launch_params, - const char *kernel_name, ReduceParams&&... 
rest_of_launch_args) -{ - - //Get reducers - auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); - - auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); - - ExecPlace place; - if(res.get_platform() == RAJA::Platform::host) { - place = RAJA::ExecPlace::HOST; - } else { - place = RAJA::ExecPlace::DEVICE; - } - - // - //Configure plugins - // -#if defined(RAJA_GPU_ACTIVE) - util::PluginContext context{place == ExecPlace::HOST ? - util::make_context() : - util::make_context()}; -#else - util::PluginContext context{util::make_context()}; -#endif - - util::callPreCapturePlugins(context); - - using RAJA::util::trigger_updates_before; - auto p_body = trigger_updates_before(launch_body); - - util::callPostCapturePlugins(context); - - util::callPreLaunchPlugins(context); - - switch (place) { - case ExecPlace::HOST: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } -#if defined(RAJA_GPU_ACTIVE) - case ExecPlace::DEVICE: { - using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); - util::callPostLaunchPlugins(context); - return e_proxy; - } -#endif - default: { - RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled"); - } - } - - RAJA_ABORT_OR_THROW("Unknown launch place"); - - //^^ RAJA will abort before getting here - return resources::EventProxy(res); -} - //Duplicate of API above on account that we need to handle the case that a kernel name is not provided template @@ -463,11 +315,12 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args) { - const char *kernel_name = nullptr; - //Get reducers auto reducers = expt::make_forall_param_pack(std::forward(rest_of_launch_args)...); + auto&& kernel_name = expt::get_kernel_name(std::forward(rest_of_launch_args)...); + std::string kname = get_kernel_name::get(kernel_name); + auto&& launch_body = expt::get_lambda(std::forward(rest_of_launch_args)...); ExecPlace place; @@ -482,10 +335,10 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, // #if defined(RAJA_GPU_ACTIVE) util::PluginContext context{place == ExecPlace::HOST ? 
- util::make_context() : - util::make_context()}; + util::make_context(&kname) : + util::make_context(&kname)}; #else - util::PluginContext context{util::make_context()}; + util::PluginContext context{util::make_context(&kname)}; #endif util::callPreCapturePlugins(context); @@ -500,14 +353,14 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params, switch (place) { case ExecPlace::HOST: { using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + resources::EventProxy e_proxy = launch_t::exec(res, launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); return e_proxy; } #if defined(RAJA_GPU_ACTIVE) case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - resources::EventProxy e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers); + resources::EventProxy e_proxy = launch_t::exec(res, launch_params, nullptr, p_body, reducers); util::callPostLaunchPlugins(context); return e_proxy; } diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 5a656206f5..623d6f48b1 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -36,7 +36,7 @@ namespace expt using Base = camp::tuple; Base param_tup; - static constexpr size_t param_tup_sz = camp::tuple_size::value; + static constexpr size_t param_tup_sz = camp::tuple_size::value; using params_seq = camp::make_idx_seq_t< param_tup_sz >; private: @@ -59,7 +59,7 @@ namespace expt static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); } - + // Resolve template static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { @@ -75,7 +75,7 @@ namespace expt static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); - + //Use the size of param_tup to generate the argument list. RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } @@ -93,8 +93,8 @@ namespace expt template ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; - }; // struct ForallParamPack - + }; // struct ForallParamPack + //=========================================================================== @@ -151,7 +151,7 @@ namespace expt return ForallParamPack...>(std::move(tuple)); } - + namespace detail { // Maybe we should do a lot of these with structs... @@ -171,7 +171,7 @@ namespace expt template constexpr auto make_forall_param_pack(Args&&... args){ // We assume the last element of the pack is the lambda so we need to strip it from the list. - auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); + auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) ); return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); } //=========================================================================== @@ -187,7 +187,25 @@ namespace expt template constexpr auto&& get_lambda(Args&&... args){ return camp::get( camp::forward_as_tuple(std::forward(args)...) 
); - } + } + //=========================================================================== + + //=========================================================================== + // + // + // kernel_name is expected to be the second to last argument, just extract it + // + // + template= 2, bool>::type = true> + constexpr auto&& get_kernel_name(Args&&... args){ + return camp::get( camp::forward_as_tuple(std::forward(args)...) ); + } + + + template::type = true> + constexpr auto&& get_kernel_name(Args&&... args){ + return camp::get<0>( camp::forward_as_tuple(std::forward(args)...) ); + } //=========================================================================== @@ -200,10 +218,10 @@ namespace expt // namespace detail { - // + // // // Lambda traits Utilities - // + // // template struct lambda_traits; @@ -211,28 +229,28 @@ namespace expt template struct lambda_traits { // non-const specialization - using arg_type = First; + using arg_type = First; }; template struct lambda_traits { // const specialization - using arg_type = First; + using arg_type = First; }; template typename lambda_traits::arg_type* lambda_arg_helper(T); - // + // // // List manipulation Utilities - // + // // template constexpr auto list_remove_pointer(const camp::list&){ return camp::list::type>...>{}; } - + template constexpr auto list_add_lvalue_ref(const camp::list&){ return camp::list::type...>{}; @@ -276,7 +294,7 @@ namespace expt template constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { #if !defined(RAJA_ENABLE_HIP) - static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); + static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. 
Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); #endif } @@ -284,7 +302,7 @@ namespace expt template - constexpr + constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp) { @@ -299,7 +317,7 @@ namespace expt detail::check_invocable(std::forward(l), expected_arg_type_list{}); } //=========================================================================== - + //=========================================================================== diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp index 4edf645ed3..a3edc3296a 100644 --- a/include/RAJA/policy/cuda/params/kernel_name.hpp +++ b/include/RAJA/policy/cuda/params/kernel_name.hpp @@ -16,7 +16,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &) { -#if defined(RAJA_ENABLE_NV_TOOLS_EXT) +#if defined(RAJA_ENABLE_NV_TOOLS_EXT) && !defined(RAJA_ENABLE_CALIPER) nvtxRangePush(kn.name); #else RAJA_UNUSED_VAR(kn); @@ -34,7 +34,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &) { -#if defined(RAJA_ENABLE_NV_TOOLS_EXT) +#if defined(RAJA_ENABLE_NV_TOOLS_EXT) && !defined(RAJA_ENABLE_CALIPER) nvtxRangePop(); #endif } diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp index 30269f8406..8a001c612a 100644 --- a/include/RAJA/policy/hip/params/kernel_name.hpp +++ b/include/RAJA/policy/hip/params/kernel_name.hpp @@ -20,7 +20,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(KernelName& kn, const RAJA::hip::detail::hipInfo &) { -#if defined(RAJA_ENABLE_ROCTX) +#if defined(RAJA_ENABLE_ROCTX) && !defined(RAJA_ENABLE_CALIPER) roctxRangePush(kn.name); #else RAJA_UNUSED_VAR(kn); @@ -38,7 +38,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > resolve(KernelName&, const RAJA::hip::detail::hipInfo &) { -#if defined(RAJA_ENABLE_ROCTX) +#if defined(RAJA_ENABLE_ROCTX) && !defined(RAJA_ENABLE_CALIPER) roctxRangePop(); #endif } diff --git a/include/RAJA/util/PluginContext.hpp b/include/RAJA/util/PluginContext.hpp index 996836e397..2bc7dc90f4 100644 --- a/include/RAJA/util/PluginContext.hpp +++ b/include/RAJA/util/PluginContext.hpp @@ -8,6 +8,8 @@ #ifndef RAJA_plugin_context_HPP #define RAJA_plugin_context_HPP +#include + #include "RAJA/policy/PolicyBase.hpp" #include "RAJA/internal/get_platform.hpp" @@ -18,10 +20,11 @@ class KokkosPluginLoader; struct PluginContext { public: - PluginContext(const Platform p) : - platform(p) {} + PluginContext(const Platform p, const std::string *name = nullptr) : + platform(p), kernel_name(name) {} - Platform platform; + Platform platform; + const std::string *kernel_name; private: mutable uint64_t kID; @@ -30,9 +33,9 @@ struct PluginContext { }; template -PluginContext make_context() +PluginContext make_context(const std::string *name=nullptr) { - return PluginContext{detail::get_platform::value}; + return PluginContext{detail::get_platform::value, name}; } } // closing brace for util namespace diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp index 91ab75dbab..f4f377ae43 100644 --- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp +++ 
b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceMin.hpp @@ -59,9 +59,9 @@ void LaunchParamExptReduceMinBasicTestImpl(const SEG_TYPE& seg, RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), - "LaunchMinBasicTest", RAJA::expt::Reduce(&mininit), RAJA::expt::Reduce(&min), + RAJA::expt::KernelName("LaunchMinBasicTest"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_MIN &_mininit, REF_MIN &_min) { RAJA::loop(ctx, seg, [&](IDX_TYPE idx) { diff --git a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp index f6200628cf..f15bdc53cd 100644 --- a/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp +++ b/test/functional/launch/reduce-params/tests/test-launch-basic-param-expt-ReduceSum.hpp @@ -57,9 +57,9 @@ void LaunchParamExptReduceSumBasicTestImpl(const SEG_TYPE& seg, RAJA::launch (RAJA::LaunchParams(RAJA::Teams(blocks), RAJA::Threads(threads)), - "LaunchSumBasicTest", RAJA::expt::Reduce(&sum), RAJA::expt::Reduce(&sum2), + RAJA::expt::KernelName("LaunchSumBasicTest"), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx, REF_SUM &_sum, REF_SUM &_sum2) { RAJA::loop(ctx, seg, [&](IDX_TYPE idx) {
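
Usage note: with this change, kernel names are passed to RAJA::forall and RAJA::launch as a RAJA::expt::KernelName(...) parameter placed before the lambda, replacing the old positional const char* argument; when RAJA is configured with RAJA_ENABLE_CALIPER, the Caliper plugin added above brackets each named kernel with CALI_MARK_BEGIN/CALI_MARK_END in preLaunch/postLaunch. The sketch below is illustrative only: the execution and launch policies shown are ordinary RAJA policies chosen for the example and are not part of this diff.

#include "RAJA/RAJA.hpp"

// Minimal sketch of the kernel-naming API introduced in this change.
// Assumes a build with -DRAJA_ENABLE_CALIPER=On and the Caliper plugin
// from examples/plugin/caliper-plugin.cpp registered with RAJA.
void daxpy_named(double* a, const double* b, double c, int N)
{
  // forall: KernelName is an optional parameter preceding the lambda.
  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::KernelName("daxpy forall"),
      [=](int i) { a[i] += b[i] * c; });

  // launch: KernelName replaces the former const char* kernel_name argument.
  RAJA::launch<RAJA::LaunchPolicy<RAJA::seq_launch_t>>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      RAJA::expt::KernelName("daxpy launch"),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<RAJA::LoopPolicy<RAJA::seq_exec>>(
            ctx, RAJA::RangeSegment(0, N),
            [&](int i) { a[i] += b[i] * c; });
      });
}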