From 1ecbc6c48ea3683ab53c3851d82b6819f657bf3b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 08:58:33 -0600 Subject: [PATCH 01/10] Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (Twostage) - Note: everything except for KokkosGraph and sptrsv runs on streams. --- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 207 +++++++++++------- sparse/src/KokkosKernels_Handle.hpp | 28 ++- sparse/src/KokkosSparse_gauss_seidel.hpp | 96 ++++---- .../src/KokkosSparse_gauss_seidel_handle.hpp | 15 +- 4 files changed, 202 insertions(+), 144 deletions(-) diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f1f7a0e6cd..f4c2ed3662 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -576,10 +576,11 @@ class TwostageGaussSeidel { tic = timer.seconds(); #endif auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); bool two_stage = gsHandle->isTwoStage(); bool compact_form = gsHandle->isCompactForm(); GSDirection direction = gsHandle->getSweepDirection(); - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; // count nnz in local L & U matrices (rowmap_viewL/rowmap_viewU stores @@ -587,28 +588,29 @@ class TwostageGaussSeidel { ordinal_t nnzA = column_view.extent(0); ordinal_t nnzL = 0; // lower-part of diagonal block ordinal_t nnzU = 0; // upper-part of diagonal block - row_map_view_t rowmap_viewL("row_mapL", + row_map_view_t rowmap_viewL(Kokkos::view_alloc(my_exec_space, "row_mapL"), num_rows + 1); // lower-part of diagonal block - row_map_view_t rowmap_viewU("row_mapU", + row_map_view_t rowmap_viewU(Kokkos::view_alloc(my_exec_space, "row_mapU"), num_rows + 1); // upper-part of diagonal block - row_map_view_t rowmap_viewLa("row_mapLa", + row_map_view_t rowmap_viewLa(Kokkos::view_alloc(my_exec_space, "row_mapLa"), num_rows + 1); // complement of U+D - row_map_view_t rowmap_viewUa("row_mapUa", + row_map_view_t rowmap_viewUa(Kokkos::view_alloc(my_exec_space, "row_mapUa"), num_rows + 1); // complement of L+D if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( - "nnzL", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewL, rowmap_viewUa), + "nnzL", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewL, rowmap_viewUa), nnzL); } + // TODO: continue if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( "nnzU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, - column_view, rowmap_viewU, rowmap_viewLa), + TSGS_Functor_t(two_stage, compact_form, num_rows, rowmap_view, + column_view, rowmap_viewU, rowmap_viewLa), nnzU); } ordinal_t nnzLa = 0; // complement of U+D @@ -633,19 +635,19 @@ class TwostageGaussSeidel { // shift ptr so that it now contains offsets (combine it with the previous // functor calls?) if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewL); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewL); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewLa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewLa); } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewU); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewU); if (compact_form) { - KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( - 1 + num_rows, rowmap_viewUa); + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum( + my_exec_space, 1 + num_rows, rowmap_viewUa); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -657,31 +659,48 @@ class TwostageGaussSeidel { #endif // allocate memory to store local D values_view_t viewD( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "diags"), + num_rows); // allocate memory to store local L entries_view_t column_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesL"), + nnzL); values_view_t values_viewL( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesL"), nnzL); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesL"), + nnzL); // allocate memory to store local U entries_view_t column_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesU"), + nnzU); values_view_t values_viewU( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesU"), nnzU); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesU"), + nnzU); // allocate memory to store complement of U+D entries_view_t column_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesLa"), + nnzLa); values_view_t values_viewLa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesLa"), nnzLa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesLa"), + nnzLa); // allocate memory to store complement of L+D entries_view_t column_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "entriesUa"), + nnzUa); values_view_t values_viewUa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesUa"), nnzUa); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "valuesUa"), + nnzUa); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -694,8 +713,8 @@ class TwostageGaussSeidel { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "entriesLU", range_policy(0, num_rows), - GS_Functor_t( + "entriesLU", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t( two_stage, compact_form, num_rows, rowmap_view, column_view, rowmap_viewL, column_viewL, rowmap_viewU, column_viewU, // @@ -709,6 +728,7 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream to finish work // construct CrsMat with them graph_t graphL(column_viewL, rowmap_viewL); graph_t graphU(column_viewU, rowmap_viewU); @@ -732,7 +752,9 @@ class TwostageGaussSeidel { gsHandle->setUa(crsmatUa); values_view_t viewDa( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "diags"), num_rows); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "diags"), + num_rows); gsHandle->setDa(viewDa); } @@ -748,6 +770,8 @@ class TwostageGaussSeidel { crsmatL.graph.entries); sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, crsmatU.graph.entries); + Kokkos::fence(); // Ensure writes land before launching more work on + // non-default streams, TODO: remove } } } @@ -762,13 +786,14 @@ class TwostageGaussSeidel { Kokkos::fence(); timer.reset(); #endif - using GS_Functor_t = + using TSGS_Functor_t = TwostageGaussSeidel_functor; - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); // load local D from handle auto viewD = gsHandle->getD(); @@ -799,12 +824,12 @@ class TwostageGaussSeidel { // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "valueLU", range_policy(0, num_rows), - GS_Functor_t(two_stage, compact_form, diagos_given, num_rows, - rowmap_view, column_view, values_view, d_invert_view, - rowmap_viewL, values_viewL, viewD, rowmap_viewU, - values_viewU, rowmap_viewLa, values_viewLa, viewDa, - rowmap_viewUa, values_viewUa)); + "valueLU", range_policy(my_exec_space, 0, num_rows), + TSGS_Functor_t(two_stage, compact_form, diagos_given, num_rows, + rowmap_view, column_view, values_view, d_invert_view, + rowmap_viewL, values_viewL, viewD, rowmap_viewU, + values_viewU, rowmap_viewLa, values_viewLa, viewDa, + rowmap_viewUa, values_viewUa)); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds(); @@ -813,6 +838,8 @@ class TwostageGaussSeidel { timer.reset(); #endif + my_exec_space.fence(); // Wait for non-default stream work to finish before + // sptrsv, TODO: remove if (!(gsHandle->isTwoStage())) { using namespace KokkosSparse::Experimental; auto sptrsv_algo = @@ -834,6 +861,8 @@ class TwostageGaussSeidel { crsmatL.graph.entries, values_viewL); sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, crsmatU.graph.entries, values_viewU); + Kokkos::fence(); // Wait for sptrsv to finish before launching work on + // a stream, TODO: remove } } } @@ -857,10 +886,11 @@ class TwostageGaussSeidel { #endif // - auto *gsHandle = get_gs_handle(); - bool two_stage = gsHandle->isTwoStage(); - bool compact_form = gsHandle->isCompactForm(); - scalar_t gamma = gsHandle->getInnerDampFactor(); + auto *gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); + bool two_stage = gsHandle->isTwoStage(); + bool compact_form = gsHandle->isCompactForm(); + scalar_t gamma = gsHandle->getInnerDampFactor(); GSDirection direction = gsHandle->getSweepDirection(); if (apply_forward && apply_backward) { @@ -883,7 +913,7 @@ class TwostageGaussSeidel { auto crsmatUa = gsHandle->getUa(); // complement of U+D (used only for compact form) - // wratp A into crsmat + // wrap A into crsmat input_crsmat_t crsmatA("A", num_rows, num_cols, values_view.extent(0), values_view, rowmap_view, column_view); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -916,36 +946,36 @@ class TwostageGaussSeidel { NumSweeps *= 2; } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector( - nrhs, localX); + KokkosKernels::Impl::zero_vector(my_exec_space, nrhs, localX); } for (int sweep = 0; sweep < NumSweeps; ++sweep) { bool forward_sweep = (direction == GS_FORWARD || (direction == GS_SYMMETRIC && sweep % 2 == 0)); // compute residual vector - KokkosBlas::scal(localR, one, localB); + KokkosBlas::scal(my_exec_space, localR, one, localB); if (sweep > 0 || !init_zero_x_vector) { if (compact_form) { if (forward_sweep) { // R = B - U*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatUa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatUa, + localX, one, localR); } else { // R = B - L*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatLa, localX, one, - localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatLa, + localX, one, localR); } if (omega != one) { // R = B - (U + (1-1/omega)D)*x scalar_t omega2 = (one / omega - one); auto localY = Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); - KokkosBlas::mult(zero, localZ, one, localDa, localY); - KokkosBlas::axpy(omega2, localZ, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localDa, localY); + KokkosBlas::axpy(my_exec_space, omega2, localZ, localR); } } else { // not compact_form // R = B - A*x - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, + localX, one, localR); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { auto localRj = @@ -981,8 +1011,11 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); + my_exec_space + .fence(); // Wait for writes to R and Z to land, TODO: remove sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, crsmatL.graph.entries, crsmatL.values, Rj, Zj); + Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove } } else { using namespace KokkosSparse::Experimental; @@ -995,8 +1028,11 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); + my_exec_space + .fence(); // Wait for writes to R and Z to land, TODO: remove sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, crsmatU.graph.entries, crsmatU.values, Rj, Zj); + Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove } } @@ -1005,21 +1041,25 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y = omega * Z - KokkosBlas::scal(localY, one, localZ); + KokkosBlas::scal(my_exec_space, localY, one, localZ); } else { // Y = Y + omega * Z - KokkosBlas::axpy(one, localZ, localY); + KokkosBlas::axpy(my_exec_space, one, localZ, localY); } } else { // ====== inner Jacobi-Richardson ===== #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS // compute initial residual norm // > compute RHS for the inner loop, R = B - A*x - internal_vector_view_t tempR("tempR", num_rows, 1); - KokkosBlas::scal(tempR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, tempR); + internal_vector_view_t tempR( + Kokkos::view_alloc(my_exec_space, std::string("tempR")), num_rows, + 1); + KokkosBlas::scal(my_exec_space, tempR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, tempR); // > initial vector for the inner loop is zero - Kokkos::deep_copy(localZ, zero); + Kokkos::deep_copy(my_exec_space, localZ, zero); + my_exec_space.fence(); using Norm_Functor_t = TwostageGaussSeidel_functor; @@ -1027,7 +1067,7 @@ class TwostageGaussSeidel { { mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1042,23 +1082,23 @@ class TwostageGaussSeidel { // row-scale: (D^{-1}*L)*Y = D^{-1}*B // compute Z := D^{-1}*R - KokkosBlas::mult(zero, localZ, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localZ, one, localD, localR); // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); } } else { // copy to localT (workspace used to save D^{-1}*R for JR iteration) - KokkosBlas::mult(zero, localT, one, localD, localR); + KokkosBlas::mult(my_exec_space, zero, localT, one, localD, localR); // initialize Jacobi-Richardson (using R as workspace for JR // iteration) - KokkosBlas::scal(localR, one, localT); + KokkosBlas::scal(my_exec_space, localR, one, localT); // apply inner damping factor, if not one if (gamma != one) { // R = gamma * R - KokkosBlas::scal(localR, gamma, localR); + KokkosBlas::scal(my_exec_space, localR, gamma, localR); } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS @@ -1066,7 +1106,7 @@ class TwostageGaussSeidel { // compute residual norm of the starting vector (D^{-1}R) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localT, tempR), normR); @@ -1077,34 +1117,34 @@ class TwostageGaussSeidel { for (int ii = 0; ii < NumInnerSweeps; ii++) { // T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U // copy T into Z - KokkosBlas::scal(localZ, one, localT); + KokkosBlas::scal(my_exec_space, localZ, one, localT); if (forward_sweep) { // Z = Z - L*R - KokkosSparse::spmv("N", scalar_t(-omega), crsmatL, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatL, + localR, one, localZ); } else { // Z = R - U*T - KokkosSparse::spmv("N", scalar_t(-omega), crsmatU, localR, one, - localZ); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-omega), crsmatU, + localR, one, localZ); } // apply inner damping factor, if not one if (gamma != one) { // Z = gamma * Z - KokkosBlas::scal(localZ, gamma, localZ); + KokkosBlas::scal(my_exec_space, localZ, gamma, localZ); // Z = Z + (one - one/gamma) * R scalar_t gamma2 = one - gamma; - KokkosBlas::axpy(gamma2, localR, localZ); + KokkosBlas::axpy(my_exec_space, gamma2, localR, localZ); } if (ii + 1 < NumInnerSweeps) { // reinitialize (R to be Z) - KokkosBlas::scal(localR, one, localZ); + KokkosBlas::scal(my_exec_space, localR, one, localZ); } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // compute residual norm(r - (L+D)*y) mag_t normR = zero; Kokkos::parallel_reduce( - "normR", range_policy(0, num_rows), + "normR", range_policy(my_exec_space, 0, num_rows), Norm_Functor_t(forward_sweep, num_rows, rowmap_view, column_view, values_view, localD, localZ, tempR), normR); @@ -1119,22 +1159,23 @@ class TwostageGaussSeidel { Kokkos::subview(localX, range_type(0, num_rows), Kokkos::ALL()); if (compact_form) { // Y := omega * z - KokkosBlas::scal(localY, omega, localZ); + KokkosBlas::scal(my_exec_space, localY, omega, localZ); } else { // Y := X + omega * Z - KokkosBlas::axpy(omega, localZ, localY); + KokkosBlas::axpy(my_exec_space, omega, localZ, localY); } } // end of inner GS sweep } // end of outer GS sweep #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS { // R = B - A*x - KokkosBlas::scal(localR, one, localB); - KokkosSparse::spmv("N", scalar_t(-one), crsmatA, localX, one, localR); + KokkosBlas::scal(my_exec_space, localR, one, localB); + KokkosSparse::spmv(my_exec_space, "N", scalar_t(-one), crsmatA, localX, + one, localR); auto localRj = Kokkos::subview(localR, Kokkos::ALL(), range_type(0, 1)); single_vector_view_t Rj(localRj.data(), num_rows); - std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2(Rj) - << std::endl; + std::cout << "norm(GS)-" << NumSweeps << " " + << KokkosBlas::nrm2(my_exec_space, Rj) << std::endl; } #endif } diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 680045823e..2c52872c96 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -609,15 +609,20 @@ class KokkosKernelsHandle { * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: - * - * KokkosSpace::GS_DEFAULT PointGaussSeidel - * KokkosSpace::GS_PERMUTED ?? - * KokkosSpace::GS_TEAM ?? - * KokkosSpace::GS_CLUSTER ?? - * KokkosSpace::GS_TWOSTAGE ?? + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. + * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. + * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. + * KokkosSpace::GS_CLUSTER Uses independent clusters of nodes in the graph. Within a cluster, x is updated sequentially. + * For more information, see: https://arxiv.org/pdf/2204.02934.pdf. + * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. + * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * - * KokkosGraph::COLORING_DEFAULT ?? + * + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array @@ -753,8 +758,11 @@ class KokkosKernelsHandle { * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * - * KokkosGraph::COLORING_DEFAULT ?? + * + * KokkosGraph::COLORING_DEFAULT Depends on execution space: + * COLORING_SERIAL on Kokkos::Serial; + * COLORING_EB on GPUs; + * COLORING_VBBIT on Kokkos::Sycl or elsewhere. * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 036fe1b119..4e362f2781 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,13 +29,13 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -112,7 +112,7 @@ void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -141,7 +141,7 @@ void gauss_seidel_symbolic(KernelHandle *handle, /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param handle KernelHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -173,15 +173,15 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -279,8 +279,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -313,16 +313,16 @@ void gauss_seidel_numeric(KernelHandle *handle, /// numeric values). This version accepts the matrix's inverse diagonal from the /// user. /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param space The execution space instance this kernel will be run on. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param space The execution space instance this kernel will be run on +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -427,8 +427,8 @@ void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided -/// inverse diagonal must share this type. -/// @param handle KernelHandle instance +/// inverse diagonal must share this type +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -468,7 +468,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param handle handle A KokkosKernelsHandle instance +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -504,7 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -512,12 +512,12 @@ void block_gauss_seidel_numeric( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle handle A KokkosKernelsHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -680,8 +680,8 @@ void symmetric_gauss_seidel_apply( /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -727,10 +727,10 @@ void symmetric_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle handle A KokkosKernelsHandle instance. +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -793,12 +793,12 @@ void symmetric_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -960,10 +960,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1008,10 +1008,10 @@ void forward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block @@ -1067,7 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// -/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1075,12 +1075,12 @@ void forward_sweep_block_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. +/// rank-1 or rank-2 View /// @param space The execution space instance this kernel will be run -/// on. NOTE: Currently only used for GS_DEFAULT. -/// @param handle KernelHandle instance +/// on. NOTE: Currently only used for GS_DEFAULT and GS_TWOSTAGE +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1242,10 +1242,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param row_map The matrix's rowmap @@ -1290,10 +1290,10 @@ void backward_sweep_gauss_seidel_apply( /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type /// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. -/// May be rank-1 or rank-2 View. +/// May be rank-1 or rank-2 View /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be -/// rank-1 or rank-2 View. -/// @param handle KernelHandle instance +/// rank-1 or rank-2 View +/// @param handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix /// @param block_size The number of degrees of freedom per block diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 624382ec5b..270fb607bd 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -767,9 +767,18 @@ class TwoStageGaussSeidelHandle void initVectors(int nrows_, int nrhs_) { if (this->nrows != nrows_ || this->nrhs != nrhs_) { - this->localR = vector_view_t("temp", nrows_, nrhs_); - this->localT = vector_view_t("temp", nrows_, nrhs_); - this->localZ = vector_view_t("temp", nrows_, nrhs_); + vector_view_t r( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localR = r; + vector_view_t t( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localT = t; + vector_view_t z( + Kokkos::view_alloc(this->execution_space, std::string("temp")), + nrows_, nrhs_); + this->localZ = z; this->nrows = nrows_; this->nrhs = nrhs_; } From 7408367e1fd907e8d1d76f2b363b583d6bf0db80 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 12:07:41 -0600 Subject: [PATCH 02/10] sparse/unit_test: Test twostage with streams --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 48c7d41a91..65138fe959 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -756,6 +756,9 @@ void test_gauss_seidel_streams_rank1( execution_space(), std::vector(nstreams, 1)); std::vector kh_v(nstreams); + std::vector kh_psgs_v(nstreams); + std::vector kh_tsgs_v(nstreams); + std::vector kh_tsgs_classic_v(nstreams); std::vector input_mat_v(nstreams); std::vector solution_x_v(nstreams); std::vector x_vector_v(nstreams); @@ -764,6 +767,8 @@ void test_gauss_seidel_streams_rank1( const scalar_t one = Kokkos::ArithTraits::one(); const scalar_t zero = Kokkos::ArithTraits::zero(); + // two-stage with SpTRSV supports only omega = one + const double omega_tsgs_classic = one; for (int i = 0; i < nstreams; i++) { input_mat_v[i] = @@ -792,8 +797,18 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); x_vector_v[i] = x_vector_tmp; - kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. - kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + kh_psgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_tsgs_classic_v[i] = + KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_psgs_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, + coloringAlgo); + kh_tsgs_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_v[i].set_gs_twostage(true, input_mat_v[i].numRows()); + kh_tsgs_classic_v[i].create_gs_handle(instances[i], nstreams, GS_TWOSTAGE, + coloringAlgo); + kh_tsgs_classic_v[i].set_gs_twostage(false, input_mat_v[i].numRows()); } int apply_count = 3; // test symmetric, forward, backward @@ -802,7 +817,7 @@ void test_gauss_seidel_streams_rank1( for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(instances[i], x_vector_v[i], zero); - run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + run_gauss_seidel_streams(instances, kh_psgs_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); for (int i = 0; i < nstreams; i++) { @@ -814,7 +829,42 @@ void test_gauss_seidel_streams_rank1( } } - for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); + //*** Two-stage version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + //*** Two-stage version (classic) **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + run_gauss_seidel_streams(instances, kh_tsgs_classic_v, input_mat_v, + x_vector_v, y_vector_v, symmetric, + omega_tsgs_classic, apply_type, nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + + for (int i = 0; i < nstreams; i++) { + kh_psgs_v[i].destroy_gs_handle(); + kh_tsgs_v[i].destroy_gs_handle(); + kh_tsgs_classic_v[i].destroy_gs_handle(); + } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From f4e3df5de9837af43a60e02f0342349f0d4d9799 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 26 Sep 2023 16:56:00 -0600 Subject: [PATCH 03/10] Update omega_tsgs_classic type --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 65138fe959..349be25931 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -768,7 +768,7 @@ void test_gauss_seidel_streams_rank1( const scalar_t one = Kokkos::ArithTraits::one(); const scalar_t zero = Kokkos::ArithTraits::zero(); // two-stage with SpTRSV supports only omega = one - const double omega_tsgs_classic = one; + auto omega_tsgs_classic = one; for (int i = 0; i < nstreams; i++) { input_mat_v[i] = From 6f510b37608d3570a8b25176f27a1bad564e655f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Sep 2023 08:45:14 -0600 Subject: [PATCH 04/10] Move sptrsv to streams --- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f4c2ed3662..12e0e1d1ee 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -604,7 +604,6 @@ class TwostageGaussSeidel { column_view, rowmap_viewL, rowmap_viewUa), nnzL); } - // TODO: continue if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy; Kokkos::parallel_reduce( @@ -766,12 +765,10 @@ class TwostageGaussSeidel { if (sptrsv_algo != SPTRSVAlgorithm::SPTRSV_CUSPARSE) { // symbolic with CuSparse needs // values - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries); - Kokkos::fence(); // Ensure writes land before launching more work on - // non-default streams, TODO: remove + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries); } } } @@ -837,9 +834,6 @@ class TwostageGaussSeidel { << "TWO-STAGE GS::NUMERIC::INSERT LU TIME : " << tic << std::endl; timer.reset(); #endif - - my_exec_space.fence(); // Wait for non-default stream work to finish before - // sptrsv, TODO: remove if (!(gsHandle->isTwoStage())) { using namespace KokkosSparse::Experimental; auto sptrsv_algo = @@ -857,12 +851,10 @@ class TwostageGaussSeidel { rowmap_viewU, column_viewU, values_viewU); // now do symbolic - sptrsv_symbolic(handle->get_gs_sptrsvL_handle(), rowmap_viewL, - crsmatL.graph.entries, values_viewL); - sptrsv_symbolic(handle->get_gs_sptrsvU_handle(), rowmap_viewU, - crsmatU.graph.entries, values_viewU); - Kokkos::fence(); // Wait for sptrsv to finish before launching work on - // a stream, TODO: remove + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvL_handle(), + rowmap_viewL, crsmatL.graph.entries, values_viewL); + sptrsv_symbolic(my_exec_space, handle->get_gs_sptrsvU_handle(), + rowmap_viewU, crsmatU.graph.entries, values_viewU); } } } @@ -1011,11 +1003,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - my_exec_space - .fence(); // Wait for writes to R and Z to land, TODO: remove - sptrsv_solve(handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, - crsmatL.graph.entries, crsmatL.values, Rj, Zj); - Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvL_handle(), + crsmatL.graph.row_map, crsmatL.graph.entries, + crsmatL.values, Rj, Zj); } } else { using namespace KokkosSparse::Experimental; @@ -1028,11 +1018,9 @@ class TwostageGaussSeidel { Kokkos::subview(localZ, Kokkos::ALL(), range_type(j, j + 1)); single_vector_view_t Rj(localRj.data(), num_rows); single_vector_view_t Zj(localZj.data(), num_rows); - my_exec_space - .fence(); // Wait for writes to R and Z to land, TODO: remove - sptrsv_solve(handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, - crsmatU.graph.entries, crsmatU.values, Rj, Zj); - Kokkos::fence(); // TODO: call sptrsv_solve on stream and remove + sptrsv_solve(my_exec_space, handle->get_gs_sptrsvU_handle(), + crsmatU.graph.row_map, crsmatU.graph.entries, + crsmatU.values, Rj, Zj); } } From 7505956d4d50f9c71c0492c967e65926e4883b15 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Sep 2023 09:59:19 -0600 Subject: [PATCH 05/10] Add lightweighthint --- .../impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 12e0e1d1ee..05c109b7db 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -712,7 +712,10 @@ class TwostageGaussSeidel { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "entriesLU", range_policy(my_exec_space, 0, num_rows), + "entriesLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), TSGS_Functor_t( two_stage, compact_form, num_rows, rowmap_view, column_view, rowmap_viewL, column_viewL, rowmap_viewU, column_viewU, @@ -821,7 +824,10 @@ class TwostageGaussSeidel { // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy; Kokkos::parallel_for( - "valueLU", range_policy(my_exec_space, 0, num_rows), + "valueLU", + Kokkos::Experimental::require( + range_policy(my_exec_space, 0, num_rows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), TSGS_Functor_t(two_stage, compact_form, diagos_given, num_rows, rowmap_view, column_view, values_view, d_invert_view, rowmap_viewL, values_viewL, viewD, rowmap_viewU, From 35b84744c9a88c4349bd7d03307e205761e578b4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Oct 2023 12:27:10 -0600 Subject: [PATCH 06/10] update symbolic check_count --- sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 36ea2d9df8..6c2782b676 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -274,7 +274,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, Kokkos::parallel_reduce( "check_count host", Kokkos::RangePolicy( - 0, nodes_per_level.extent(0)), + space, 0, nodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += nodes_per_level(i); }, @@ -285,8 +285,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy( - 0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(0, dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, @@ -740,8 +739,8 @@ void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy( - 0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(space, 0, + dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, From d11fd83a678bcba2fb0a337ec4474edac31b7032 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 28 Mar 2024 13:57:38 -0600 Subject: [PATCH 07/10] BLAS - Axpby: only print debug info at high debug level Currently Axpby is flooding the output so badle that it is hard to find/understand any output from the CI when it turns on debugging information. If someone really wants that thing to print everywhere they can hard code the macro themselve... --- .../KokkosBlas1_axpby_unification_attempt_traits.hpp | 2 +- blas/unit_test/Test_Blas1_axpby_unification.hpp | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 9d200e892d..5241481098 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -768,7 +768,7 @@ struct AxpbyUnificationAttemptTraits { // ******************************************************************** // Routine to print information on input variables and internal variables // ******************************************************************** -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if (KOKKOSKERNELS_DEBUG_LEVEL > 1) static void printInformation(std::ostream& os, std::string const& headerMsg) { os << headerMsg << ": AV = " << typeid(AV).name() diff --git a/blas/unit_test/Test_Blas1_axpby_unification.hpp b/blas/unit_test/Test_Blas1_axpby_unification.hpp index 6ce7bad0b1..f0abde262b 100644 --- a/blas/unit_test/Test_Blas1_axpby_unification.hpp +++ b/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -54,6 +54,12 @@ #include #include +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#define KOKKOSKERNELS_DEBUG_LEVEL_OLD KOKKOSKERNELS_DEBUG_LEVEL +#undef KOKKOSKERNELS_DEBUG_LEVEL +#define KOKKOSKERNELS_DEBUG_LEVEL 2 +#endif + static constexpr int numVecsAxpbyTest = 15; namespace Test { @@ -2537,6 +2543,12 @@ void impl_test_axpby_mv_unification(int const N, int const K) { } // namespace Test +#if (KOKKOSKERNELS_DEBUG_LEVEL > 1) +#undef KOKKOSKERNELS_DEBUG_LEVEL +#define KOKKOSKERNELS_DEBUG_LEVEL KOKKOSKERNELS_DEBUG_LEVEL_OLD +#undef KOKKOSKERNELS_DEBUG_LEVEL_OLD +#endif + template int test_axpby_unification() { From 923ebf4239f5d85ea2544f5d5e90b628444798bb Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 28 Mar 2024 14:23:11 -0600 Subject: [PATCH 08/10] Sparse - twostage GS: implementing Brian's review More consistent use of space argument and removing unnecessary fence and object destruction. --- sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp | 4 ++-- sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 1 - sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 6 ------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 6c2782b676..258720c6a3 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -274,7 +274,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, Kokkos::parallel_reduce( "check_count host", Kokkos::RangePolicy( - space, 0, nodes_per_level.extent(0)), + 0, nodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += nodes_per_level(i); }, @@ -285,7 +285,7 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy(0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(space, 0, dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 05c109b7db..3fddc09175 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -1053,7 +1053,6 @@ class TwostageGaussSeidel { one, tempR); // > initial vector for the inner loop is zero Kokkos::deep_copy(my_exec_space, localZ, zero); - my_exec_space.fence(); using Norm_Functor_t = TwostageGaussSeidel_functor; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 349be25931..401f0f0c38 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -859,12 +859,6 @@ void test_gauss_seidel_streams_rank1( << "on stream_idx: " << i; } } - - for (int i = 0; i < nstreams; i++) { - kh_psgs_v[i].destroy_gs_handle(); - kh_tsgs_v[i].destroy_gs_handle(); - kh_tsgs_classic_v[i].destroy_gs_handle(); - } } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From e3e11220254520eb430247b96ac1a5cad57b64b7 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 28 Mar 2024 14:27:25 -0600 Subject: [PATCH 09/10] Applying clang-format --- sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 258720c6a3..8c5496a027 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -285,7 +285,8 @@ void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", - Kokkos::RangePolicy(space, 0, dnodes_per_level.extent(0)), + Kokkos::RangePolicy(space, 0, + dnodes_per_level.extent(0)), KOKKOS_LAMBDA(const long i, long& update) { update += dnodes_per_level(i); }, From d52f809acd6127605b08fc488958c9a29c90790c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 28 Mar 2024 15:54:17 -0600 Subject: [PATCH 10/10] BLAS - Axpby: fixing issue with output level --- blas/src/KokkosBlas1_axpby.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 0f95786dfc..5cd03dd7c7 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -73,7 +73,7 @@ void axpby(const execution_space& exec_space, const AV& a, const XMV& X, // Perform compile time checks and run time checks. // ********************************************************************** AxpbyTraits::performChecks(a, X, b, Y); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if (KOKKOSKERNELS_DEBUG_LEVEL > 1) AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); #endif // KOKKOSKERNELS_DEBUG_LEVEL