diff --git a/examples/plugin/CMakeLists.txt b/examples/plugin/CMakeLists.txt
index 4a2c3ac37e..d4f0da8c5d 100644
--- a/examples/plugin/CMakeLists.txt
+++ b/examples/plugin/CMakeLists.txt
@@ -7,8 +7,8 @@ if(RAJA_ENABLE_CALIPER)
   raja_add_executable(
-    NAME raja-caliper
-    SOURCES raja-caliper.cpp caliper-plugin.cpp)
+    NAME raja-forall-caliper
+    SOURCES raja-forall-caliper.cpp caliper-plugin.cpp)
 
   raja_add_executable(
     NAME raja-launch-caliper
     SOURCES raja-launch-caliper.cpp caliper-plugin.cpp)
diff --git a/examples/plugin/raja-caliper.cpp b/examples/plugin/raja-forall-caliper.cpp
similarity index 94%
rename from examples/plugin/raja-caliper.cpp
rename to examples/plugin/raja-forall-caliper.cpp
index 4b686bff79..3f10806cfa 100644
--- a/examples/plugin/raja-caliper.cpp
+++ b/examples/plugin/raja-forall-caliper.cpp
@@ -157,7 +157,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 
   std::memcpy( a, a0, N * sizeof(double) );
 
-  RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
+  RAJA::forall<RAJA::omp_parallel_for_exec>
+    (RAJA::RangeSegment(0, N),
+     RAJA::expt::KernelName("CALI: RAJA OpenMP daxpy Kernel"),
+     [=] (int i) {
     a[i] += b[i] * c;
   });
 
@@ -180,7 +183,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice ));
   cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice ));
 
-  RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, N),
+  RAJA::forall<RAJA::cuda_exec<256>>
+    (RAJA::RangeSegment(0, N),
+     RAJA::expt::KernelName("CALI: RAJA CUDA daxpy Kernel"),
     [=] RAJA_DEVICE (int i) {
     a[i] += b[i] * c;
   });
@@ -210,7 +215,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice ));
   hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice ));
 
-  RAJA::forall<RAJA::hip_exec<256>>(RAJA::RangeSegment(0, N),
+  RAJA::forall<RAJA::hip_exec<256>>
+    (RAJA::RangeSegment(0, N),
+     RAJA::expt::KernelName("CALI: RAJA HIP daxpy Kernel"),
     [=] RAJA_DEVICE (int i) {
     a[i] += b[i] * c;
   });
diff --git a/examples/plugin/raja-launch-caliper.cpp b/examples/plugin/raja-launch-caliper.cpp
index fc664bd8eb..2736844ce6 100644
--- a/examples/plugin/raja-launch-caliper.cpp
+++ b/examples/plugin/raja-launch-caliper.cpp
@@ -17,9 +17,6 @@
  * RAJA Caliper integration with launch
  */
 
-using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
-using loop_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
-
 //
 // Functions for checking and printing results
 //
@@ -39,6 +36,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 //
   const int N = 1000000;
 
+//
+// Define compute grid
+//
+  const int Nthreads = 256;
+  const int Nteams = (N-1)/Nthreads + 1;
+
 //
 // Allocate and initialize vector data.
 //
@@ -71,6 +74,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   std::memcpy( a, a0, N * sizeof(double) );
 
   {
+    timer.reset();
     timer.start();
     CALI_CXX_MARK_SCOPE("CALI: C-version elapsed time");
     for (int i = 0; i < N; ++i) {
@@ -92,19 +96,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   std::memcpy( a, a0, N * sizeof(double) );
 
   {
+    using seq_launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
+    using seq_loop_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
+
     timer.reset();
     timer.start();
-    RAJA::launch<launch_policy>
-      (RAJA::LaunchParams(RAJA::Teams(), RAJA::Threads()),
-       RAJA::expt::KernelName("CALI: launch kernel"),
-       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+    RAJA::launch<seq_launch_policy>
+      (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)),
+       RAJA::expt::KernelName("CALI: launch kernel"),
+       [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
-        RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i)
-        {
-          a[i] += b[i] * c;
-        });
+        RAJA::loop<seq_loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i)
+        {
+          a[i] += b[i] * c;
+        });
 
-    });
+      });
     timer.stop();
     RAJA::Timer::ElapsedType etime = timer.elapsed();
     std::cout << "C-version elapsed time : " << etime << " seconds" << std::endl;
@@ -119,6 +126,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 // RAJA CUDA parallel GPU version (256 threads per thread block).
 //
   std::cout << "\n Running RAJA CUDA daxpy...\n";
+  using cuda_launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;
+  using cuda_loop_policy = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;
   a = 0; b = 0;
   cudaErrchk(cudaMalloc( (void**)&a, N * sizeof(double) ));
@@ -127,10 +136,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   cudaErrchk(cudaMemcpy( a, a0, N * sizeof(double), cudaMemcpyHostToDevice ));
   cudaErrchk(cudaMemcpy( b, tb, N * sizeof(double), cudaMemcpyHostToDevice ));
 
-  //RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, N),
-  //[=] RAJA_DEVICE (int i) {
-  //a[i] += b[i] * c;
-  //});
+  RAJA::launch<cuda_launch_policy>
+    (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)),
+     RAJA::expt::KernelName("CALI: launch Cuda kernel"),
+     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+      RAJA::loop<cuda_loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i)
+      {
+        a[i] += b[i] * c;
+      });
+
+    });
 
   cudaErrchk(cudaMemcpy( ta, a, N * sizeof(double), cudaMemcpyDeviceToHost ));
@@ -149,6 +165,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
 // RAJA HIP parallel GPU version (256 threads per thread block).
 //
   std::cout << "\n Running RAJA HIP daxpy...\n";
+  using hip_launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>;
+  using hip_loop_policy = RAJA::LoopPolicy<RAJA::hip_global_thread_x>;
   a = 0; b = 0;
   hipErrchk(hipMalloc( (void**)&a, N * sizeof(double) ));
@@ -157,10 +175,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
   hipErrchk(hipMemcpy( a, a0, N * sizeof(double), hipMemcpyHostToDevice ));
   hipErrchk(hipMemcpy( b, tb, N * sizeof(double), hipMemcpyHostToDevice ));
 
-  //RAJA::forall<RAJA::hip_exec<256>>(RAJA::RangeSegment(0, N),
-  //[=] RAJA_DEVICE (int i) {
-  //a[i] += b[i] * c;
-  //});
+  RAJA::launch<hip_launch_policy>
+    (RAJA::LaunchParams(RAJA::Teams(Nteams), RAJA::Threads(Nthreads)),
+     RAJA::expt::KernelName("CALI: launch HIP kernel"),
+     [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
+
+      RAJA::loop<hip_loop_policy>(ctx, RAJA::RangeSegment(0, N), [&] (int i)
+      {
+        a[i] += b[i] * c;
+      });
+
+    });
 
   hipErrchk(hipMemcpy( ta, a, N * sizeof(double), hipMemcpyDeviceToHost ));
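
For reference, a minimal standalone sketch (not part of the patch) of the kernel-naming pattern the diff introduces: an extra RAJA::expt::KernelName argument between the segment and the lambda, which the Caliper plugin reports as a named region. The sequential policy, array setup, and the CALI_CONFIG example below are illustrative assumptions, not taken from the patch.

// Sketch: annotate a RAJA forall so it appears as a named region in Caliper
// reports (e.g. run with CALI_CONFIG=runtime-report), assuming RAJA is built
// with Caliper support and the executable links the caliper-plugin from this PR.
#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 1000;
  double* a = new double[N];
  double* b = new double[N];
  const double c = 3.14159;

  for (int i = 0; i < N; ++i) { a[i] = 1.0; b[i] = 2.0; }

  // Same call shape as the patch: (segment, KernelName, lambda).
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
    RAJA::expt::KernelName("CALI: example daxpy"),
    [=] (int i) {
      a[i] += b[i] * c;
    });

  delete[] a;
  delete[] b;
  return 0;
}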