Skip to content

Commit

Permalink
prov/efa: Only support HMEM in efa-direct if p2p support is available
Browse files Browse the repository at this point in the history
The efa-direct provider does not support copy-based protocols, so it cannot
support HMEM transfers if the HMEM device does not have p2p support.

Signed-off-by: Sai Sunku <[email protected]>
  • Loading branch information
sunkuamzn committed Feb 5, 2025
1 parent 5ba9e49 commit 76c9ecf
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 5 deletions.
34 changes: 29 additions & 5 deletions prov/efa/src/efa_prov_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -388,20 +388,44 @@ static int efa_prov_info_set_nic_attr(struct fi_info *prov_info, struct efa_devi
}

#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI
/**
 * @brief Advertise FI_HMEM on a provider info object when HMEM is usable
 *
 * NOTE(review): this hunk is shown without +/- markers, so the superseded
 * one-argument signature and its old condition appear next to the lines that
 * replace them. The description below covers the two-argument version.
 *
 * Nothing is modified unless ep_type is FI_EP_RDM and at least one of the
 * CUDA, NEURON or SYNAPSEAI interfaces reports as initialized. Every entry of
 * efa_hmem_ifaces is then checked against g_efa_hmem_info; if any initialized
 * interface has p2p_supported_by_device unset, a message is logged and the
 * function returns without setting any flag. Otherwise FI_HMEM is OR-ed into
 * caps, tx_attr->caps and rx_attr->caps, and FI_MR_HMEM into
 * domain_attr->mr_mode.
 *
 * @param[in,out]	prov_info	info object whose capability bits may be updated
 * @param[in]		ep_type		endpoint type the info object is being built for
 */
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info)
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum fi_ep_type ep_type)
{
if (prov_info->ep_attr->type == FI_EP_RDM &&
(ofi_hmem_is_initialized(FI_HMEM_CUDA) ||
int i;
enum fi_hmem_iface iface;
struct efa_hmem_info *hmem_info;
bool enable_hmem = false;

/* Only FI_EP_RDM info objects can get the HMEM flags */
if (ep_type != FI_EP_RDM)
return;

/* EFA direct only supports HMEM when p2p support is available */
if ((ofi_hmem_is_initialized(FI_HMEM_CUDA) ||
ofi_hmem_is_initialized(FI_HMEM_NEURON) ||
ofi_hmem_is_initialized(FI_HMEM_SYNAPSEAI))) {
/* One initialized interface without p2p is enough to leave HMEM off */
EFA_HMEM_IFACE_FOREACH(i) {
iface = efa_hmem_ifaces[i];
hmem_info = &g_efa_hmem_info[iface];
if (hmem_info->initialized && !hmem_info->p2p_supported_by_device) {
EFA_INFO(FI_LOG_CORE,
"EFA direct provider was compiled with support for %s HMEM interface "
"but the interface does not support p2p transfers. "
"EFA direct provider does not support HMEM transfers without p2p support. "
"HMEM support will be disabled.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
return;
}
}
/* Reached only when no initialized interface lacked p2p */
enable_hmem = true;
}

if (enable_hmem) {
prov_info->caps |= FI_HMEM;
prov_info->tx_attr->caps |= FI_HMEM;
prov_info->rx_attr->caps |= FI_HMEM;
prov_info->domain_attr->mr_mode |= FI_MR_HMEM;
}
}
#else
/* Built without CUDA, NEURON and SYNAPSEAI: the function is an empty stub */
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info)
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum fi_ep_type ep_type)
{
}
#endif
Expand Down Expand Up @@ -480,7 +504,7 @@ int efa_prov_info_alloc(struct fi_info **prov_info_ptr,
goto err_free;
}

efa_prov_info_set_hmem_flags(prov_info);
efa_prov_info_set_hmem_flags(prov_info, ep_type);

*prov_info_ptr = prov_info;
return 0;
Expand Down
2 changes: 2 additions & 0 deletions prov/efa/src/efa_prov_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ int efa_prov_info_compare_domain_name(const struct fi_info *hints,
int efa_prov_info_compare_pci_bus_id(const struct fi_info *hints,
const struct fi_info *info);

/**
 * @brief Set the FI_HMEM capability bits and FI_MR_HMEM mr_mode bit on
 * prov_info when HMEM can be supported for the given endpoint type.
 *
 * @param[in,out]	prov_info	info object to update
 * @param[in]		ep_type		endpoint type the info object is built for
 */
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum fi_ep_type ep_type);

#endif
55 changes: 55 additions & 0 deletions prov/efa/test/efa_unit_test_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#include "efa_unit_tests.h"
#include "efa_prov_info.h"

/**
* @brief test that when a wrong fi_info was used to open resource, the error is handled
Expand Down Expand Up @@ -113,6 +114,60 @@ void test_info_direct_attributes()
}
}

/**
 * @brief Verify that efa direct only supports HMEM with p2p
 *
 * efa_prov_info_set_hmem_flags() is run on a fresh fi_info twice with the
 * CUDA interface forced to look initialized: once with p2p reported as
 * supported (FI_HMEM must appear in caps, tx caps and rx caps) and once with
 * p2p reported as unsupported (FI_HMEM must be absent from all three).
 */
#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI
/**
 * @brief Run efa_prov_info_set_hmem_flags() on a fresh fi_info and report
 * which capability fields received FI_HMEM
 *
 * The fi_info is allocated and released inside this helper so that the caller
 * holds no resources when it evaluates the results.
 *
 * @param[in]	p2p_supported	value stored in
 *				g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device
 * @param[out]	caps		true if FI_HMEM is set in info->caps
 * @param[out]	tx_caps		true if FI_HMEM is set in info->tx_attr->caps
 * @param[out]	rx_caps		true if FI_HMEM is set in info->rx_attr->caps
 * @return	true on success, false if fi_allocinfo() failed (outputs untouched)
 */
static bool efa_unit_test_probe_direct_hmem(bool p2p_supported, bool *caps,
					    bool *tx_caps, bool *rx_caps)
{
	struct fi_info *info;

	info = fi_allocinfo();
	if (!info)
		return false;

	/* g_efa_hmem_info is populated in efa_hmem_info_initialize which runs on
	 * every fi_getinfo call. So no need to save and reset these fields
	 */
	g_efa_hmem_info[FI_HMEM_CUDA].initialized = true;
	g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = p2p_supported;

	efa_prov_info_set_hmem_flags(info, FI_EP_RDM);

	*caps = (info->caps & FI_HMEM) != 0;
	*tx_caps = (info->tx_attr->caps & FI_HMEM) != 0;
	*rx_caps = (info->rx_attr->caps & FI_HMEM) != 0;

	fi_freeinfo(info);
	return true;
}

void test_info_direct_hmem_support_p2p()
{
	bool hmem_ops_cuda_init;
	bool p2p_ok, p2p_caps = false, p2p_tx_caps = false, p2p_rx_caps = false;
	bool no_p2p_ok, no_p2p_caps = true, no_p2p_tx_caps = true, no_p2p_rx_caps = true;

	memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info));

	/* Save the current value of hmem_ops[FI_HMEM_CUDA].initialized so it can
	 * be restored below. hmem_ops is populated in ofi_hmem_init, which only
	 * runs once.
	 *
	 * The CUDA iface is initialized on Nvidia GPU platforms but not on others.
	 * Forcing hmem_ops[FI_HMEM_CUDA].initialized lets this test run on all
	 * instance types.
	 */
	hmem_ops_cuda_init = hmem_ops[FI_HMEM_CUDA].initialized;
	hmem_ops[FI_HMEM_CUDA].initialized = true;

	p2p_ok = efa_unit_test_probe_direct_hmem(true, &p2p_caps,
						 &p2p_tx_caps, &p2p_rx_caps);
	no_p2p_ok = efa_unit_test_probe_direct_hmem(false, &no_p2p_caps,
						    &no_p2p_tx_caps, &no_p2p_rx_caps);

	/* Reset hmem_ops[FI_HMEM_CUDA].initialized before any assertion runs: a
	 * failing assertion ends the test immediately, and the forced value must
	 * not leak into the tests that follow.
	 */
	hmem_ops[FI_HMEM_CUDA].initialized = hmem_ops_cuda_init;

	/* p2p available: HMEM must be advertised everywhere */
	assert_true(p2p_ok);
	assert_true(p2p_caps);
	assert_true(p2p_tx_caps);
	assert_true(p2p_rx_caps);

	/* p2p unavailable: HMEM must not be advertised anywhere */
	assert_true(no_p2p_ok);
	assert_false(no_p2p_caps);
	assert_false(no_p2p_tx_caps);
	assert_false(no_p2p_rx_caps);
}
#else
/* Built without any HMEM interface: there is nothing to verify */
void test_info_direct_hmem_support_p2p()
{
}
#endif

/**
* @brief Verify info->tx/rx_attr->msg_order is set according to hints.
*
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_info_rdm_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_dgram_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_direct_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_direct_hmem_support_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ void test_info_open_ep_with_wrong_info();
void test_info_rdm_attributes();
void test_info_dgram_attributes();
void test_info_direct_attributes();
void test_info_direct_hmem_support_p2p();
void test_info_tx_rx_msg_order_rdm_order_none();
void test_info_tx_rx_msg_order_rdm_order_sas();
void test_info_tx_rx_msg_order_dgram_order_none();
Expand Down

0 comments on commit 76c9ecf

Please sign in to comment.