From c76cca8634487434fb49c7d4533bed3d78adbd8a Mon Sep 17 00:00:00 2001 From: Sai Sunku Date: Mon, 27 Jan 2025 17:31:10 +0000 Subject: [PATCH] prov/efa: Only support HMEM in efa-direct if p2p support is available The efa-direct provider does not support copy-based protocols, so it cannot support HMEM transfers if the HMEM device does not have p2p support. Signed-off-by: Sai Sunku --- prov/efa/src/efa_prov_info.c | 38 +++++++++++++++++++++++---- prov/efa/src/efa_prov_info.h | 2 ++ prov/efa/test/efa_unit_test_info.c | 42 ++++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 1 + prov/efa/test/efa_unit_tests.h | 1 + 5 files changed, 79 insertions(+), 5 deletions(-) diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index 055f0cfb3fe..d0c3af61e36 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -442,20 +442,48 @@ static int efa_prov_info_set_nic_attr(struct fi_info *prov_info, struct efa_devi } #if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI -void efa_prov_info_set_hmem_flags(struct fi_info *prov_info) +void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type) { - if (prov_info->ep_attr->type == FI_EP_RDM && - (ofi_hmem_is_initialized(FI_HMEM_CUDA) || + int i; + enum fi_hmem_iface iface; + struct efa_hmem_info *hmem_info; + bool enable_hmem = false; + + if ((ofi_hmem_is_initialized(FI_HMEM_CUDA) || ofi_hmem_is_initialized(FI_HMEM_NEURON) || ofi_hmem_is_initialized(FI_HMEM_SYNAPSEAI))) { + if (info_type == EFA_INFO_RDM) + enable_hmem = true; + + if (info_type == EFA_INFO_DIRECT) { + /* EFA direct only supports HMEM when p2p support is available */ + EFA_HMEM_IFACE_FOREACH(i) { + iface = efa_hmem_ifaces[i]; + hmem_info = &g_efa_hmem_info[iface]; + if (hmem_info->initialized && !hmem_info->p2p_supported_by_device) { + EFA_WARN(FI_LOG_CORE, + "EFA direct provider was compiled with support for %s HMEM interface " + "but the interface does not support p2p transfers. 
" + "EFA direct provider does not support HMEM transfers without p2p support. " + "HMEM support will be disabled.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE)); + goto set_hmem; + } + } + enable_hmem = true; + } + } + +set_hmem: + if (enable_hmem) { prov_info->caps |= FI_HMEM; prov_info->tx_attr->caps |= FI_HMEM; prov_info->rx_attr->caps |= FI_HMEM; prov_info->domain_attr->mr_mode |= FI_MR_HMEM; } + } #else -void efa_prov_info_set_hmem_flags(struct fi_info *prov_info) +void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type) { } #endif @@ -536,7 +564,7 @@ int efa_prov_info_alloc(struct fi_info **prov_info_ptr, goto err_free; } - efa_prov_info_set_hmem_flags(prov_info); + efa_prov_info_set_hmem_flags(prov_info, info_type); *prov_info_ptr = prov_info; return 0; diff --git a/prov/efa/src/efa_prov_info.h b/prov/efa/src/efa_prov_info.h index c6c08765952..5ee0bde4a2b 100644 --- a/prov/efa/src/efa_prov_info.h +++ b/prov/efa/src/efa_prov_info.h @@ -22,4 +22,6 @@ int efa_prov_info_compare_domain_name(const struct fi_info *hints, int efa_prov_info_compare_pci_bus_id(const struct fi_info *hints, const struct fi_info *info); +void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type); + #endif diff --git a/prov/efa/test/efa_unit_test_info.c b/prov/efa/test/efa_unit_test_info.c index db52ccd0594..67d4e0deaa4 100644 --- a/prov/efa/test/efa_unit_test_info.c +++ b/prov/efa/test/efa_unit_test_info.c @@ -2,6 +2,7 @@ /* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include "efa_unit_tests.h" +#include "efa_prov_info.h" /** * @brief test that when a wrong fi_info was used to open resource, the error is handled @@ -113,6 +114,47 @@ void test_info_direct_attributes() } } +/** + * @brief Verify that efa direct only supports HMEM with p2p + */ +#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI +void test_info_direct_hmem_support_p2p() +{ + struct fi_info *info; + + info = fi_allocinfo(); + + memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info)); + + g_efa_hmem_info[FI_HMEM_CUDA].initialized = true; + g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true; + hmem_ops[FI_HMEM_CUDA].initialized = true; + + efa_prov_info_set_hmem_flags(info, EFA_INFO_DIRECT); + assert_true(info->caps & FI_HMEM); + assert_true(info->tx_attr->caps & FI_HMEM); + assert_true(info->rx_attr->caps & FI_HMEM); + hmem_ops[FI_HMEM_CUDA].initialized = false; + fi_freeinfo(info); + + info = fi_allocinfo(); + g_efa_hmem_info[FI_HMEM_CUDA].initialized = true; + g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = false; + hmem_ops[FI_HMEM_CUDA].initialized = true; + + efa_prov_info_set_hmem_flags(info, EFA_INFO_DIRECT); + assert_false(info->caps & FI_HMEM); + assert_false(info->tx_attr->caps & FI_HMEM); + assert_false(info->rx_attr->caps & FI_HMEM); + hmem_ops[FI_HMEM_CUDA].initialized = false; + fi_freeinfo(info); +} +#else +void test_info_direct_hmem_support_p2p() +{ +} +#endif + /** * @brief Verify info->tx/rx_attr->msg_order is set according to hints. 
* diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 4586a7f8637..92f7b4d61e7 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -140,6 +140,7 @@ int main(void) cmocka_unit_test_setup_teardown(test_info_rdm_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_dgram_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_direct_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_info_direct_hmem_support_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index db438923b78..3b4a39c48d2 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -160,6 +160,7 @@ void test_info_open_ep_with_wrong_info(); void test_info_rdm_attributes(); void test_info_dgram_attributes(); void test_info_direct_attributes(); +void test_info_direct_hmem_support_p2p(); void test_info_tx_rx_msg_order_rdm_order_none(); void test_info_tx_rx_msg_order_rdm_order_sas(); void test_info_tx_rx_msg_order_dgram_order_none();