Skip to content

Commit

Permalink
prov/efa: Only support HMEM in efa-direct if p2p support is available
Browse files Browse the repository at this point in the history
efa-direct provider does not support copy based protocols, so it cannot
support HMEM transfers if the HMEM device does not have p2p support.

Signed-off-by: Sai Sunku <[email protected]>
  • Loading branch information
sunkuamzn committed Jan 31, 2025
1 parent 37ac4f4 commit c76cca8
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 5 deletions.
38 changes: 33 additions & 5 deletions prov/efa/src/efa_prov_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,20 +442,48 @@ static int efa_prov_info_set_nic_attr(struct fi_info *prov_info, struct efa_devi
}

#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI
/**
 * @brief Set the FI_HMEM capability and FI_MR_HMEM mr_mode on a prov_info
 * when HMEM can be supported for the requested info type.
 *
 * HMEM is only considered when ofi_hmem_is_initialized() reports at least one
 * of the CUDA, Neuron or SynapseAI interfaces as initialized:
 *  - EFA_INFO_RDM: that alone enables HMEM.
 *  - EFA_INFO_DIRECT: every interface visited by EFA_HMEM_IFACE_FOREACH that
 *    is marked initialized in g_efa_hmem_info must also have
 *    p2p_supported_by_device set; the first one that does not causes a
 *    warning to be logged and HMEM to stay disabled.
 *  - any other info type: prov_info is left unchanged.
 *
 * NOTE(review): this listing appears to be a rendered diff in which removed
 * and added lines are interleaved (the one-argument signature and the
 * FI_EP_RDM condition below look like the pre-change lines) -- confirm
 * against the repository before treating it as compilable source.
 *
 * @param[in,out]	prov_info	fi_info whose caps, tx_attr->caps,
 *					rx_attr->caps and domain_attr->mr_mode
 *					are OR-ed with the HMEM bits
 * @param[in]		info_type	info type compared against EFA_INFO_RDM
 *					and EFA_INFO_DIRECT
 */
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info)
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type)
{
if (prov_info->ep_attr->type == FI_EP_RDM &&
(ofi_hmem_is_initialized(FI_HMEM_CUDA) ||
int i;
enum fi_hmem_iface iface;
struct efa_hmem_info *hmem_info;
/* Stays false unless one of the branches below decides HMEM is usable */
bool enable_hmem = false;

if ((ofi_hmem_is_initialized(FI_HMEM_CUDA) ||
ofi_hmem_is_initialized(FI_HMEM_NEURON) ||
ofi_hmem_is_initialized(FI_HMEM_SYNAPSEAI))) {
if (info_type == EFA_INFO_RDM)
enable_hmem = true;

if (info_type == EFA_INFO_DIRECT) {
/* EFA direct only supports HMEM when p2p support is available */
EFA_HMEM_IFACE_FOREACH(i) {
iface = efa_hmem_ifaces[i];
hmem_info = &g_efa_hmem_info[iface];
if (hmem_info->initialized && !hmem_info->p2p_supported_by_device) {
EFA_WARN(FI_LOG_CORE,
"EFA direct provider was compiled with support for %s HMEM interface "
"but the interface does not support p2p transfers. "
"EFA direct provider does not support HMEM transfers without p2p support. "
"HMEM support will be disabled.\n", fi_tostr(&iface, FI_TYPE_HMEM_IFACE));
/* Jump past the assignment below so enable_hmem remains false */
goto set_hmem;
}
}
/* Reached only when no initialized interface lacked p2p support */
enable_hmem = true;
}
}

set_hmem:
if (enable_hmem) {
prov_info->caps |= FI_HMEM;
prov_info->tx_attr->caps |= FI_HMEM;
prov_info->rx_attr->caps |= FI_HMEM;
prov_info->domain_attr->mr_mode |= FI_MR_HMEM;
}

}
#else
/* None of HAVE_CUDA, HAVE_NEURON or HAVE_SYNAPSEAI is set: nothing to do. */
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info)
void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type)
{
}
#endif
Expand Down Expand Up @@ -536,7 +564,7 @@ int efa_prov_info_alloc(struct fi_info **prov_info_ptr,
goto err_free;
}

efa_prov_info_set_hmem_flags(prov_info);
efa_prov_info_set_hmem_flags(prov_info, info_type);

*prov_info_ptr = prov_info;
return 0;
Expand Down
2 changes: 2 additions & 0 deletions prov/efa/src/efa_prov_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ int efa_prov_info_compare_domain_name(const struct fi_info *hints,
int efa_prov_info_compare_pci_bus_id(const struct fi_info *hints,
const struct fi_info *info);

void efa_prov_info_set_hmem_flags(struct fi_info *prov_info, enum efa_info_type info_type);

#endif
42 changes: 42 additions & 0 deletions prov/efa/test/efa_unit_test_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */

#include "efa_unit_tests.h"
#include "efa_prov_info.h"

/**
* @brief test that when a wrong fi_info was used to open resource, the error is handled
Expand Down Expand Up @@ -113,6 +114,47 @@ void test_info_direct_attributes()
}
}

/**
 * @brief Verify that efa direct only supports HMEM with p2p
 *
 * Marks CUDA as initialized in both g_efa_hmem_info and hmem_ops, then calls
 * efa_prov_info_set_hmem_flags() with EFA_INFO_DIRECT twice: once with
 * p2p_supported_by_device set (FI_HMEM and FI_MR_HMEM must be set) and once
 * with it cleared (they must stay clear).
 *
 * The globals this test overrides are snapshotted first and restored before
 * any assertion on the results runs, so neither a passing nor a failing run
 * leaks the faked HMEM state into later tests or teardown.
 */
#if HAVE_CUDA || HAVE_NEURON || HAVE_SYNAPSEAI
void test_info_direct_hmem_support_p2p()
{
	struct efa_hmem_info saved_hmem_info[OFI_HMEM_MAX];
	struct fi_info *p2p_info, *no_p2p_info;
	bool saved_cuda_initialized;

	/* Allocate up front so an allocation failure cannot leave globals modified */
	p2p_info = fi_allocinfo();
	assert_non_null(p2p_info);
	no_p2p_info = fi_allocinfo();
	assert_non_null(no_p2p_info);

	/* Snapshot the global state that is overridden below */
	memcpy(saved_hmem_info, g_efa_hmem_info, sizeof(saved_hmem_info));
	saved_cuda_initialized = hmem_ops[FI_HMEM_CUDA].initialized;

	memset(g_efa_hmem_info, 0, OFI_HMEM_MAX * sizeof(struct efa_hmem_info));
	g_efa_hmem_info[FI_HMEM_CUDA].initialized = true;
	hmem_ops[FI_HMEM_CUDA].initialized = true;

	/* CUDA initialized with p2p support */
	g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = true;
	efa_prov_info_set_hmem_flags(p2p_info, EFA_INFO_DIRECT);

	/* CUDA initialized without p2p support */
	g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = false;
	efa_prov_info_set_hmem_flags(no_p2p_info, EFA_INFO_DIRECT);

	/*
	 * Restore before asserting: a failed assertion aborts the test
	 * function, which would otherwise skip the restore.
	 */
	hmem_ops[FI_HMEM_CUDA].initialized = saved_cuda_initialized;
	memcpy(g_efa_hmem_info, saved_hmem_info, sizeof(saved_hmem_info));

	assert_true(p2p_info->caps & FI_HMEM);
	assert_true(p2p_info->tx_attr->caps & FI_HMEM);
	assert_true(p2p_info->rx_attr->caps & FI_HMEM);
	assert_true(p2p_info->domain_attr->mr_mode & FI_MR_HMEM);

	assert_false(no_p2p_info->caps & FI_HMEM);
	assert_false(no_p2p_info->tx_attr->caps & FI_HMEM);
	assert_false(no_p2p_info->rx_attr->caps & FI_HMEM);
	assert_false(no_p2p_info->domain_attr->mr_mode & FI_MR_HMEM);

	fi_freeinfo(p2p_info);
	fi_freeinfo(no_p2p_info);
}
#else
/* Built without CUDA, Neuron and SynapseAI support: nothing to verify. */
void test_info_direct_hmem_support_p2p()
{
}
#endif

/**
* @brief Verify info->tx/rx_attr->msg_order is set according to hints.
*
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ int main(void)
cmocka_unit_test_setup_teardown(test_info_rdm_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_dgram_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_direct_attributes, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_direct_hmem_support_p2p, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_rdm_order_sas, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
cmocka_unit_test_setup_teardown(test_info_tx_rx_msg_order_dgram_order_none, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown),
Expand Down
1 change: 1 addition & 0 deletions prov/efa/test/efa_unit_tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ void test_info_open_ep_with_wrong_info();
void test_info_rdm_attributes();
void test_info_dgram_attributes();
void test_info_direct_attributes();
void test_info_direct_hmem_support_p2p();
void test_info_tx_rx_msg_order_rdm_order_none();
void test_info_tx_rx_msg_order_rdm_order_sas();
void test_info_tx_rx_msg_order_dgram_order_none();
Expand Down

0 comments on commit c76cca8

Please sign in to comment.