From 48c1d837e160391d5932e57c51fdbb2fda904f8b Mon Sep 17 00:00:00 2001 From: doujiang24 Date: Thu, 13 Feb 2025 14:08:54 +0800 Subject: [PATCH] [TransferEngine] fix: use ibv_get_device_list to get the IB devices. In a container, only some of the uverbs devices may be mounted into the pod, e.g. ``` $ ls /sys/class/infiniband erdma_0 erdma_1 $ ls /dev/infiniband/ rdma_cm uverbs0 $ ibv_devices device node GUID ------ ---------------- erdma_0 02163efffe1041ea ``` Signed-off-by: doujiang24 --- mooncake-transfer-engine/src/topology.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/mooncake-transfer-engine/src/topology.cpp b/mooncake-transfer-engine/src/topology.cpp index 1474503..4c2a908 100644 --- a/mooncake-transfer-engine/src/topology.cpp +++ b/mooncake-transfer-engine/src/topology.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -44,20 +45,17 @@ struct InfinibandDevice { }; static std::vector listInfiniBandDevices() { - DIR *dir = opendir("/sys/class/infiniband"); - struct dirent *entry; + int num_devices = 0; std::vector devices; - if (dir == NULL) { - PLOG(WARNING) << "Failed to open /sys/class/infiniband"; + struct ibv_device **device_list = ibv_get_device_list(&num_devices); + if (!device_list || num_devices <= 0) { + LOG(WARNING) << "No IB devices found"; return {}; } - while ((entry = readdir(dir))) { - if (entry->d_name[0] == '.') { - continue; - } - std::string device_name = entry->d_name; + for (int i = 0; i < num_devices; ++i) { + std::string device_name = ibv_get_device_name(device_list[i]); char path[PATH_MAX + 32]; char resolved_path[PATH_MAX]; @@ -65,7 +63,7 @@ static std::vector listInfiniBandDevices() { // "/sys/class/infiniband/mlx5_X/" is a symlink to // "/sys/devices/pciXXXX:XX/XXXX:XX:XX.X/infiniband/mlx5_X/". 
snprintf(path, sizeof(path), "/sys/class/infiniband/%s/../..", - entry->d_name); + device_name.c_str()); if (realpath(path, resolved_path) == NULL) { PLOG(ERROR) << "Failed to parse realpath"; continue; @@ -80,7 +78,6 @@ static std::vector listInfiniBandDevices() { .pci_bus_id = std::move(pci_bus_id), .numa_node = numa_node}); } - (void)closedir(dir); return devices; }