From b3c4d41ce862a03ea0ee6a102437d12a2d22cedf Mon Sep 17 00:00:00 2001 From: David Cortes Date: Tue, 12 Nov 2024 19:29:04 +0100 Subject: [PATCH 1/6] add spectral decomposition fallback on GPU --- .../dal/algo/linear_regression/test/batch.cpp | 3 +- .../algo/linear_regression/test/fixture.hpp | 78 +++---- .../backend/primitives/lapack/solve_dpc.cpp | 191 +++++++++++++++++- 3 files changed, 229 insertions(+), 43 deletions(-) diff --git a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp index a5585202c31..33f690d2110 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/batch.cpp @@ -51,9 +51,10 @@ TEMPLATE_LIST_TEST_M(lr_batch_test, "LR common flow", "[lr][batch]", lr_types) { } TEMPLATE_LIST_TEST_M(lr_batch_test, "LR with non-PSD matrix", "[lr][batch-nonpsd]", lr_types) { - SKIP_IF(this->non_psd_system_not_supported_on_device()); + SKIP_IF(this->not_float64_friendly()); this->generate(777); + this->run_and_check_linear_indefinite(); this->run_and_check_linear_indefinite_multioutput(); } diff --git a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp index e1e54092552..bfba2abf9e5 100644 --- a/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp +++ b/cpp/oneapi/dal/algo/linear_regression/test/fixture.hpp @@ -65,10 +65,6 @@ class lr_test : public te::crtp_algo_fixture { return static_cast(this); } - bool non_psd_system_not_supported_on_device() { - return this->get_policy().is_gpu(); - } - table compute_responses(const table& beta, const table& bias, const table& data) const { const auto s_count = data.get_row_count(); @@ -299,18 +295,20 @@ class lr_test : public te::crtp_algo_fixture { here is not positive-definite, thus it has an infinite number of possible solutions. The solution here is the one with minimum norm, which is typically more desirable. */ void run_and_check_linear_indefinite(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 1) .copy_data(y, 3, 1) @@ -321,20 +319,20 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { 0.27785494, - 0.53011669, - 0.34352259, - 0.40506216, - -1.26026447 }; - const double expected_intercept[] = { 1.24485441 }; + const float_t expected_beta[] = { 0.27785494, + 0.53011669, + 0.34352259, + 0.40506216, + -1.26026447 }; + const float_t expected_intercept[] = { 1.24485441 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 1) .copy_data(expected_intercept, 1, 1) @@ -351,13 +349,13 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { 0.38217445, - 0.2732197, - 1.87135517, - 0.63458468, - -2.08473134 }; + const float_t expected_beta[] = { 0.38217445, + 0.2732197, + 1.87135517, + 0.63458468, + -2.08473134 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 5) .copy_data(expected_beta, 1, 5) @@ -369,19 +367,21 @@ class lr_test : public te::crtp_algo_fixture { } void run_and_check_linear_indefinite_multioutput(double tol = 1e-3) { - const double X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, - 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, - 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; - const double y[] = { 0.13632112, 1.53203308, -0.65996941, - -0.31179486, 0.33776913, -2.2074711 }; + const auto dtype = + std::is_same::value ? data_type::float64 : data_type::float32; + const float_t X[] = { -0.98912135, -0.36778665, 1.28792526, 0.19397442, 0.9202309, + 0.57710379, -0.63646365, 0.54195222, -0.31659545, -0.32238912, + 0.09716732, -1.52593041, 1.1921661, -0.67108968, 1.00026942 }; + const float_t y[] = { 0.13632112, 1.53203308, -0.65996941, + -0.31179486, 0.33776913, -2.2074711 }; auto X_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 5) .copy_data(X, 3, 5) .build(); auto y_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(3, 2) .copy_data(y, 3, 2) @@ -392,19 +392,19 @@ class lr_test : public te::crtp_algo_fixture { const auto coefs = train_res.get_coefficients(); if (desc.get_result_options().test(result_options::intercept)) { - const double expected_beta[] = { + const float_t expected_beta[] = { -0.18692112, -0.20034801, -0.09590892, -0.13672683, 0.56229012, -0.97006008, 1.39413595, 0.49238012, 1.11041239, -0.79213452, }; - const double expected_intercept[] = { -0.48964358, 0.96467681 }; + const float_t expected_intercept[] = { -0.48964358, 0.96467681 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) .build(); const auto expected_intercept_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(1, 2) .copy_data(expected_intercept, 1, 2) @@ -421,11 +421,11 @@ class lr_test : public te::crtp_algo_fixture { } else { - const double expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, - 0.88658098, -0.88921961, 1.19505839, 1.67634561, - 1.2882766, -1.43103981 }; + const float_t expected_beta[] = { -0.22795353, -0.09930168, -0.69685744, -0.22700585, + 0.88658098, -0.88921961, 1.19505839, 1.67634561, + 1.2882766, -1.43103981 }; const auto expected_beta_tbl = oneapi::dal::detail::homogen_table_builder() - .set_data_type(data_type::float64) + .set_data_type(dtype) .set_layout(data_layout::row_major) .allocate(2, 5) .copy_data(expected_beta, 2, 5) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 223f6753f69..77ffd3e2540 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -14,6 +14,8 @@ * limitations under the License. *******************************************************************************/ +#include + #include "oneapi/dal/backend/primitives/lapack.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/ndindexer.hpp" @@ -59,6 +61,167 @@ inline sycl::event beta_copy_transform(sycl::queue& queue, }); } +/* +This is an adaptation of the CPU version, which can be found in this file: +cpp/daal/src/algorithms/service_kernel_math.h + +It solves a linear system A*x = b +where 'b' might be a matrix (multiple RHS) + +It is intended as a fallback for solving linear regression, where these +matrices are obtained as follows: + A = t(X)*X + b = t(X)*y + +It can handle systems that are not positive semi-definite, so it's used +as a fallback when Cholesky fails or when it involves too small numbers +(which tends to happen when floating point error results in a slightly +positive matrix when it should have zero determinant in theory). +*/ +template +sycl::event solve_spectral_decomposition( + sycl::queue& queue, + ndview& A, // t(X)*X, LHS, gets overwritten + sycl::event& event_A, + ndview& b, // t(X)*y, RHS, solution is outputted here + sycl::event& event_b, + const std::int64_t dim_A, + const std::int64_t nrhs) { + constexpr auto alloc = sycl::usm::alloc::device; + + /* Decompose: A = Q * diag(l) * t(Q) */ + /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */ + auto eigenvalues = array::empty(queue, dim_A * nrhs, alloc); + auto eigenvalues_view = ndview::wrap(eigenvalues); + sycl::event syevd_event = + syevd(queue, dim_A, A, dim_A, eigenvalues_view, { event_A }); + const Float eps = std::numeric_limits::epsilon(); + + /* Discard too small singular values */ + std::int64_t num_discarded; + { + /* This is placed inside a block because the array created here is not used any further */ + auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event }); + const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data(); + const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1]; + if (largest_ev <= eps) { + throw std::runtime_error( + "Could not solve linear system. Problem has too small singular values."); + } + const Float component_threshold = eps * largest_ev; + for (num_discarded = 0; num_discarded < dim_A - 1; num_discarded++) { + if (eigenvalues_cpu_ptr[num_discarded] > component_threshold) + break; + } + } + + /* Create the square root of the inverse: Qis = Q * diag(1 / sqrt(l)) */ + std::int64_t num_taken = dim_A - num_discarded; + auto ev_mutable = eigenvalues.get_mutable_data(); + sycl::event inv_sqrt_eigenvalues_event = queue.submit([&](sycl::handler& h) { + h.depends_on(syevd_event); + h.parallel_for(num_taken, [=](const auto& i) { + const std::size_t ix = i + num_discarded; + ev_mutable[ix] = sycl::sqrt(Float(1) / ev_mutable[ix]); + }); + }); + + auto Q_mutable = A.get_mutable_data(); + sycl::event inv_sqrt_eigenvectors_event = queue.submit([&](sycl::handler& h) { + const auto range = oneapi::dal::backend::make_range_2d(num_taken, dim_A); + h.depends_on(inv_sqrt_eigenvalues_event); + h.parallel_for(range, [=](sycl::id<2> id) { + const std::size_t col = id[0] + num_discarded; + const std::size_t row = id[1]; + Q_mutable[row + col * dim_A] *= ev_mutable[col]; + }); + }); + + /* Now calculate the actual solution: Qis * Qis' * B */ + const std::int64_t eigenvectors_offset = num_discarded * dim_A; + if (nrhs == 1) { + sycl::event gemv_right_event = + mkl::blas::column_major::gemv(queue, + mkl::transpose::trans, + dim_A, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + b.get_data(), + 1, + Float(0), + ev_mutable, + 1, + { inv_sqrt_eigenvectors_event, event_b }); + return mkl::blas::column_major::gemv(queue, + mkl::transpose::nontrans, + dim_A, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + ev_mutable, + 1, + Float(0), + b.get_mutable_data(), + 1, + { gemv_right_event }); + } + + else { + sycl::event gemm_right_event = + mkl::blas::column_major::gemm(queue, + mkl::transpose::trans, + mkl::transpose::nontrans, + num_taken, + nrhs, + dim_A, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + b.get_data(), + dim_A, + Float(0), + ev_mutable, + num_taken, + { inv_sqrt_eigenvectors_event, event_b }); + return mkl::blas::column_major::gemm(queue, + mkl::transpose::nontrans, + mkl::transpose::nontrans, + dim_A, + nrhs, + num_taken, + Float(1), + Q_mutable + eigenvectors_offset, + dim_A, + ev_mutable, + num_taken, + Float(0), + b.get_mutable_data(), + dim_A, + { gemm_right_event }); + } +} + +/* Returns the minimum value among entries in the diagonal of a square matrix */ +template +Float diagonal_minimum(sycl::queue& queue, + const Float* Matrix, + const std::int64_t dim_matrix, + sycl::event& event_Matrix) { + constexpr auto alloc = sycl::usm::alloc::device; + auto diag_min_holder = array::empty(queue, 1, alloc); + sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { + auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>()); + h.depends_on(event_Matrix); + h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { + min_obj.combine(Matrix[i * (dim_matrix + 1)]); + }); + }); + return ndview::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event }); +} + template sycl::event solve_system(sycl::queue& queue, const ndview& xtx, @@ -70,12 +233,34 @@ sycl::event solve_system(sycl::queue& queue, auto [nxty, xty_event] = copy(queue, xty, dependencies); auto [nxtx, xtx_event] = copy(queue, xtx, dependencies); + const std::int64_t dim_xtx = xtx.get_dimension(0); + + sycl::event solution_event; opt_array dummy{}; - auto potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); - auto potrs_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + try { + sycl::event potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); + const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event); + if (diag_min <= 1e-6) + throw mkl::exception(""); + solution_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); + } + catch (mkl::exception& ex) { + const std::int64_t nrhs = nxty.get_dimension(0); + /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */ + sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies); + sycl::event xty_event_new = copy(queue, nxty, xty, dependencies); + + solution_event = solve_spectral_decomposition(queue, + nxtx, + xtx_event_new, + nxty, + xty_event_new, + dim_xtx, + nrhs); + } - return beta_copy_transform(queue, nxty, final_xty, { potrs_event }); + return beta_copy_transform(queue, nxty, final_xty, { solution_event }); } #define INSTANTIATE(U, B, F, XL, YL) \ From d528fab78c415ba29aa81aaa49428835f021de4f Mon Sep 17 00:00:00 2001 From: David Cortes Date: Thu, 14 Nov 2024 17:51:06 +0100 Subject: [PATCH 2/6] fix failing min diagonal --- .../backend/primitives/lapack/solve_dpc.cpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 77ffd3e2540..a0f42ecadbe 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -211,15 +211,18 @@ Float diagonal_minimum(sycl::queue& queue, const std::int64_t dim_matrix, sycl::event& event_Matrix) { constexpr auto alloc = sycl::usm::alloc::device; - auto diag_min_holder = array::empty(queue, 1, alloc); - sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { - auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>()); - h.depends_on(event_Matrix); - h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { - min_obj.combine(Matrix[i * (dim_matrix + 1)]); - }); - }); - return ndview::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event }); + auto idx_min_holder = array::empty(queue, 1, alloc); + sycl::event diag_min_event = mkl::blas::column_major::iamin( + queue, + dim_matrix, + Matrix, + dim_matrix + 1, + idx_min_holder.get_mutable_data(), + mkl::index_base::zero, + { event_Matrix } + ); + const std::int64_t idx_min = ndview::wrap(idx_min_holder).at_device(queue, 0, { diag_min_event }); + return ndview::wrap(Matrix, dim_matrix * dim_matrix).at_device(queue, idx_min * (dim_matrix + 1), { event_Matrix }); } template From 5255e648bdd0d9e605da55eee9270cb9da5ceeae Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 15 Nov 2024 08:54:58 +0100 Subject: [PATCH 3/6] switch back to sycl reduction for diag min --- .../backend/primitives/lapack/solve_dpc.cpp | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index a0f42ecadbe..305f74a4a67 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -211,18 +211,21 @@ Float diagonal_minimum(sycl::queue& queue, const std::int64_t dim_matrix, sycl::event& event_Matrix) { constexpr auto alloc = sycl::usm::alloc::device; - auto idx_min_holder = array::empty(queue, 1, alloc); - sycl::event diag_min_event = mkl::blas::column_major::iamin( - queue, - dim_matrix, - Matrix, - dim_matrix + 1, - idx_min_holder.get_mutable_data(), - mkl::index_base::zero, - { event_Matrix } - ); - const std::int64_t idx_min = ndview::wrap(idx_min_holder).at_device(queue, 0, { diag_min_event }); - return ndview::wrap(Matrix, dim_matrix * dim_matrix).at_device(queue, idx_min * (dim_matrix + 1), { event_Matrix }); + auto diag_min_holder = array::empty(queue, 1, alloc); + sycl::event diag_min_holder_init = queue.submit([&](sycl::handler& h) { + Float* diag_min_ptr = diag_min_holder.get_mutable_data(); + h.parallel_for(1, [=](const auto& i) { + diag_min_ptr[i] = std::numeric_limits::infinity(); + }); + }); + sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { + auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>()); + h.depends_on({ diag_min_holder_init, event_Matrix }); + h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { + min_obj.combine(Matrix[i * (dim_matrix + 1)]); + }); + }); + return ndview::wrap(diag_min_holder).at_device(queue, 0, { diag_min_event }); } template @@ -245,10 +248,10 @@ sycl::event solve_system(sycl::queue& queue, sycl::event potrf_event = potrf_factorization(queue, nxtx, dummy, { xtx_event }); const Float diag_min = diagonal_minimum(queue, nxtx.get_data(), dim_xtx, potrf_event); if (diag_min <= 1e-6) - throw mkl::exception(""); + throw mkl::lapack::computation_error("", "", 0); solution_event = potrs_solution(queue, nxtx, nxty, dummy, { potrf_event, xty_event }); } - catch (mkl::exception& ex) { + catch (mkl::lapack::computation_error& ex) { const std::int64_t nrhs = nxty.get_dimension(0); /* Note: this templated version of 'copy' reuses the layout that was specified in the previous copy */ sycl::event xtx_event_new = copy(queue, nxtx, xtx, dependencies); From 141b65b4c2e42aac0d0d27ac4117b3eb9e555b6c Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 15 Nov 2024 09:03:49 +0100 Subject: [PATCH 4/6] try another way --- .../dal/backend/primitives/lapack/solve_dpc.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 305f74a4a67..5dc20bfd768 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -212,15 +212,11 @@ Float diagonal_minimum(sycl::queue& queue, sycl::event& event_Matrix) { constexpr auto alloc = sycl::usm::alloc::device; auto diag_min_holder = array::empty(queue, 1, alloc); - sycl::event diag_min_holder_init = queue.submit([&](sycl::handler& h) { - Float* diag_min_ptr = diag_min_holder.get_mutable_data(); - h.parallel_for(1, [=](const auto& i) { - diag_min_ptr[i] = std::numeric_limits::infinity(); - }); - }); sycl::event diag_min_event = queue.submit([&](sycl::handler& h) { - auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), sycl::minimum<>()); - h.depends_on({ diag_min_holder_init, event_Matrix }); + auto min_reduction = sycl::reduction(diag_min_holder.get_mutable_data(), + sycl::minimum<>(), + sycl::property::reduction::initialize_to_identity()); + h.depends_on({ event_Matrix }); h.parallel_for(dim_matrix, min_reduction, [=](const auto& i, auto& min_obj) { min_obj.combine(Matrix[i * (dim_matrix + 1)]); }); From e88ed4717a84281210202343b67952fbdf93cc44 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 15 Nov 2024 13:09:25 +0100 Subject: [PATCH 5/6] use ndarray instead of array for allocation --- cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 5dc20bfd768..401a304cfbc 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -91,17 +91,16 @@ sycl::event solve_spectral_decomposition( /* Decompose: A = Q * diag(l) * t(Q) */ /* Note: for NRHS>1, this will overallocate in order to reuse the memory as buffer later on */ - auto eigenvalues = array::empty(queue, dim_A * nrhs, alloc); - auto eigenvalues_view = ndview::wrap(eigenvalues); + auto eigenvalues = ndarray::empty(queue, dim_A * nrhs, alloc); sycl::event syevd_event = - syevd(queue, dim_A, A, dim_A, eigenvalues_view, { event_A }); + syevd(queue, dim_A, A, dim_A, eigenvalues, { event_A }); const Float eps = std::numeric_limits::epsilon(); /* Discard too small singular values */ std::int64_t num_discarded; { /* This is placed inside a block because the array created here is not used any further */ - auto eigenvalues_cpu = eigenvalues_view.to_host(queue, { syevd_event }); + auto eigenvalues_cpu = eigenvalues.to_host(queue, { syevd_event }); const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data(); const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1]; if (largest_ev <= eps) { From a3254527c2d4c5e242823f8f96ae6537cbc0fa27 Mon Sep 17 00:00:00 2001 From: David Cortes Date: Fri, 15 Nov 2024 16:52:21 +0100 Subject: [PATCH 6/6] avoid copying unused entries in array --- cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp index 401a304cfbc..efa497e73fe 100644 --- a/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/lapack/solve_dpc.cpp @@ -100,7 +100,10 @@ sycl::event solve_spectral_decomposition( std::int64_t num_discarded; { /* This is placed inside a block because the array created here is not used any further */ - auto eigenvalues_cpu = eigenvalues.to_host(queue, { syevd_event }); + /* Note: the array for 'eigenvalues' is allocated with size [dimA, nrhs], + but by this point, only 'dimA' elements will have been written to it. */ + auto eigenvalues_cpu = + ndview::wrap(eigenvalues.get_data(), dim_A).to_host(queue, { syevd_event }); const Float* eigenvalues_cpu_ptr = eigenvalues_cpu.get_data(); const Float largest_ev = eigenvalues_cpu_ptr[dim_A - 1]; if (largest_ev <= eps) {