From d0a563ebe7c0942c22023b5ef95e41b0266c4fe4 Mon Sep 17 00:00:00 2001 From: Eric Raut Date: Tue, 21 Nov 2023 21:54:50 +0000 Subject: [PATCH] Add rma_pingpong benchmark RMA latency test for write and writedata operations. * Write: polls receive buffer on receive side * Writedata: polls for completions on receiver side Signed-off-by: Eric Raut --- fabtests/Makefile.am | 6 + fabtests/Makefile.win | 4 +- fabtests/benchmarks/benchmark_shared.c | 78 ++++++++++ fabtests/benchmarks/benchmark_shared.h | 1 + fabtests/benchmarks/rma_pingpong.c | 146 +++++++++++++++++++ fabtests/common/shared.c | 115 +++++++++++++++ fabtests/include/shared.h | 5 +- fabtests/man/fabtests.7.md | 3 + fabtests/pytest/default/test_rma_pingpong.py | 17 +++ fabtests/pytest/efa/test_rma_pingpong.py | 34 +++++ fabtests/pytest/shm/test_rma_pingpong.py | 23 +++ 11 files changed, 430 insertions(+), 2 deletions(-) create mode 100644 fabtests/benchmarks/rma_pingpong.c create mode 100644 fabtests/pytest/default/test_rma_pingpong.py create mode 100644 fabtests/pytest/efa/test_rma_pingpong.py create mode 100644 fabtests/pytest/shm/test_rma_pingpong.py diff --git a/fabtests/Makefile.am b/fabtests/Makefile.am index c743d570284..e835f915616 100644 --- a/fabtests/Makefile.am +++ b/fabtests/Makefile.am @@ -52,6 +52,7 @@ bin_PROGRAMS = \ benchmarks/fi_rdm_cntr_pingpong \ benchmarks/fi_dgram_pingpong \ benchmarks/fi_rdm_pingpong \ + benchmarks/fi_rma_pingpong \ benchmarks/fi_rdm_tagged_pingpong \ benchmarks/fi_rdm_tagged_bw \ unit/fi_eq_test \ @@ -401,6 +402,11 @@ benchmarks_fi_rdm_pingpong_SOURCES = \ $(benchmarks_srcs) benchmarks_fi_rdm_pingpong_LDADD = libfabtests.la +benchmarks_fi_rma_pingpong_SOURCES = \ + benchmarks/rma_pingpong.c \ + $(benchmarks_srcs) +benchmarks_fi_rma_pingpong_LDADD = libfabtests.la + benchmarks_fi_rdm_tagged_pingpong_SOURCES = \ benchmarks/rdm_tagged_pingpong.c \ $(benchmarks_srcs) diff --git a/fabtests/Makefile.win b/fabtests/Makefile.win index 48487cd56dc..2f60516fa4e 100644 --- 
a/fabtests/Makefile.win +++ b/fabtests/Makefile.win @@ -74,7 +74,7 @@ all: benchmarks functional unit multinode benchmarks: $(outdir)\dgram_pingpong.exe $(outdir)\msg_bw.exe \ $(outdir)\msg_pingpong.exe $(outdir)\rdm_cntr_pingpong.exe \ - $(outdir)\rdm_pingpong.exe $(outdir)\rdm_tagged_bw.exe \ + $(outdir)\rdm_pingpong.exe $(outdir)\rma_pingpong.exe $(outdir)\rdm_tagged_bw.exe \ $(outdir)\rdm_tagged_pingpong.exe $(outdir)\rma_bw.exe functional: $(outdir)\av_xfer.exe $(outdir)\bw.exe $(outdir)\cm_data.exe $(outdir)\cq_data.exe \ @@ -107,6 +107,8 @@ $(outdir)\rdm_cntr_pingpong.exe: {benchmarks}rdm_cntr_pingpong.c $(basedeps) {be $(outdir)\rdm_pingpong.exe: {benchmarks}rdm_pingpong.c $(basedeps) {benchmarks}benchmark_shared.c +$(outdir)\rma_pingpong.exe: {benchmarks}rma_pingpong.c $(basedeps) {benchmarks}benchmark_shared.c + $(outdir)\rdm_tagged_bw.exe: {benchmarks}rdm_tagged_bw.c $(basedeps) {benchmarks}benchmark_shared.c $(outdir)\rdm_tagged_pingpong.exe: {benchmarks}rdm_tagged_pingpong.c $(basedeps) {benchmarks}benchmark_shared.c diff --git a/fabtests/benchmarks/benchmark_shared.c b/fabtests/benchmarks/benchmark_shared.c index f9997dd6e20..d649f08e1f2 100644 --- a/fabtests/benchmarks/benchmark_shared.c +++ b/fabtests/benchmarks/benchmark_shared.c @@ -143,6 +143,84 @@ int pingpong(void) return 0; } +int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote) +{ + int ret, i, inject_size; + + inject_size = inject_size_set ? + hints->tx_attr->inject_size : fi->tx_attr->inject_size; + + if (ft_check_opts(FT_OPT_ENABLE_HMEM)) + inject_size = 0; + + /* for FT_OPT_VERIFY_DATA, we cannot use inject, as we require + * completions to indicate delivery has completed. 
*/ + if (ft_check_opts(FT_OPT_VERIFY_DATA)) + inject_size = 0; + + ret = ft_sync(); + if (ret) + return ret; + + if (opts.transfer_size == 0) { + FT_ERR("Zero-sized transfers not supported"); + return EXIT_FAILURE; + } + + /* Init rx_buf for test */ + *(rx_buf + opts.transfer_size-1) = (char)-1; + + if (opts.dst_addr) { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + + if (i == opts.warmup_iterations) + ft_start(); + + /* Init tx_buf for test */ + *(tx_buf + opts.transfer_size-1) = (char)i; + + if (opts.transfer_size <= inject_size) + ret = ft_inject_rma(rma_op, remote, ep, remote_fi_addr, opts.transfer_size); + else + ret = ft_tx_rma(rma_op, remote, ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + if (ret) + return ret; + + ret = ft_rx_rma(i, rma_op, ep, opts.transfer_size); + if (ret) + return ret; + } + } else { + for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) { + if (i == opts.warmup_iterations) + ft_start(); + + ret = ft_rx_rma(i, rma_op, ep, opts.transfer_size); + if (ret) + return ret; + + /* Init tx_buf for test */ + *(tx_buf + opts.transfer_size-1) = (char)i; + + if (opts.transfer_size <= inject_size) + ret = ft_inject_rma(rma_op, remote, ep, remote_fi_addr, opts.transfer_size); + else + ret = ft_tx_rma(rma_op, remote, ep, remote_fi_addr, opts.transfer_size, &tx_ctx); + if (ret) + return ret; + } + } + ft_stop(); + + if (opts.machr) + show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2, + opts.argc, opts.argv); + else + show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2); + + return 0; +} + static int bw_tx_comp() { int ret; diff --git a/fabtests/benchmarks/benchmark_shared.h b/fabtests/benchmarks/benchmark_shared.h index ce1927b6063..1dcc7352fea 100644 --- a/fabtests/benchmarks/benchmark_shared.h +++ b/fabtests/benchmarks/benchmark_shared.h @@ -47,6 +47,7 @@ void ft_parse_benchmark_opts(int op, char *optarg); void ft_benchmark_usage(void); int pingpong(void); int bandwidth(void); +int 
pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote); int bandwidth_rma(enum ft_rma_opcodes op, struct fi_rma_iov *remote); #ifdef __cplusplus diff --git a/fabtests/benchmarks/rma_pingpong.c b/fabtests/benchmarks/rma_pingpong.c new file mode 100644 index 00000000000..9839b15ea50 --- /dev/null +++ b/fabtests/benchmarks/rma_pingpong.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2013-2016 Intel Corporation. All rights reserved. + * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * This software is available to you under the BSD license + * below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include + +#include +#include "benchmark_shared.h" + +static int run(void) +{ + int i, ret; + + if (hints->ep_attr->type == FI_EP_MSG) { + if (!opts.dst_addr) { + ret = ft_start_server(); + if (ret) + return ret; + } + + ret = opts.dst_addr ? 
ft_client_connect() : ft_server_connect(); + } else { + ret = ft_init_fabric(); + } + if (ret) + return ret; + + ret = ft_exchange_keys(&remote); + if (ret) + return ret; + + if (!(opts.options & FT_OPT_SIZE)) { + for (i = 0; i < TEST_CNT; i++) { + if (!ft_use_size(i, opts.sizes_enabled)) + continue; + opts.transfer_size = test_size[i].size; + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong_rma(opts.rma_op, &remote); + if (ret) + goto out; + } + } else { + init_test(&opts, test_name, sizeof(test_name)); + ret = pingpong_rma(opts.rma_op, &remote); + if (ret) + goto out; + } + + ft_finalize(); +out: + return ret; +} + +int main(int argc, char **argv) +{ + int op, ret; + + opts = INIT_OPTS; + + hints = fi_allocinfo(); + if (!hints) + return EXIT_FAILURE; + + hints->caps = FI_MSG | FI_RMA | FI_WRITE | FI_REMOTE_WRITE; + hints->domain_attr->resource_mgmt = FI_RM_ENABLED; + hints->mode = FI_CONTEXT; + hints->domain_attr->threading = FI_THREAD_DOMAIN; + hints->addr_format = opts.address_format; + + while ((op = getopt_long(argc, argv, "Uh" CS_OPTS INFO_OPTS API_OPTS + BENCHMARK_OPTS, long_opts, &lopt_idx)) != -1) { + switch (op) { + default: + if (!ft_parse_long_opts(op, optarg)) + continue; + ft_parse_benchmark_opts(op, optarg); + ft_parseinfo(op, optarg, hints, &opts); + ft_parsecsopts(op, optarg, &opts); + ret = ft_parse_api_opts(op, optarg, hints, &opts); + if (ret) + return ret; + break; + case 'U': + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + break; + case '?': + case 'h': + ft_csusage(argv[0], "Pingpong test using RMA operations."); + ft_benchmark_usage(); + FT_PRINT_OPTS_USAGE("-o ", "rma op type: write|writedata (default: write)\n"); + ft_longopts_usage(); + return EXIT_FAILURE; + } + } + + /* We only support write and writedata verbs */ + if (opts.rma_op != FT_RMA_WRITE && opts.rma_op != FT_RMA_WRITEDATA) { + FT_ERR("Only write and writedata verbs are supported by rma_pingpong"); + return EXIT_FAILURE; + } + + /* data validation on read 
and write ops requires delivery_complete semantics. */ + if (opts.rma_op != FT_RMA_WRITEDATA && ft_check_opts(FT_OPT_VERIFY_DATA)) + hints->tx_attr->op_flags |= FI_DELIVERY_COMPLETE; + + if (optind < argc) + opts.dst_addr = argv[optind]; + + hints->domain_attr->mr_mode = opts.mr_mode; + hints->tx_attr->tclass = FI_TC_LOW_LATENCY; + + ret = run(); + + ft_free_res(); + return -ret; +} diff --git a/fabtests/common/shared.c b/fabtests/common/shared.c index 2ea24425bfc..cb90b390a03 100644 --- a/fabtests/common/shared.c +++ b/fabtests/common/shared.c @@ -2170,6 +2170,25 @@ ssize_t ft_tx(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx) return ret; } +ssize_t ft_tx_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote, struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx) +{ + ssize_t ret; + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + /* Fill data. Last byte reserved for iteration number */ + ret = ft_fill_buf((char *) tx_buf, size-1); + if (ret) + return ret; + } + + ret = ft_post_rma(rma_op, tx_buf, size, remote, ctx); + if (ret) + return ret; + + ret = ft_get_tx_comp(tx_seq); + return ret; +} + ssize_t ft_post_inject_buf(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *op_buf, uint64_t op_tag) { @@ -2209,6 +2228,24 @@ ssize_t ft_inject(struct fid_ep *ep, fi_addr_t fi_addr, size_t size) return ret; } +ssize_t ft_inject_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote, struct fid_ep *ep, fi_addr_t fi_addr, size_t size) +{ + ssize_t ret; + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + /* Fill data. 
Last byte reserved for iteration number */ + ret = ft_fill_buf((char *) tx_buf, size-1); + if (ret) + return ret; + } + + ret = ft_post_rma_inject(rma_op, tx_buf, size, remote); + if (ret) + return ret; + + return ret; +} + static size_t ft_remote_write_offset(const char *buf) { assert(buf >= tx_buf && buf < (tx_buf + tx_buf_size)); @@ -2423,6 +2460,49 @@ ssize_t ft_rx(struct fid_ep *ep, size_t size) return ret; } +ssize_t ft_rx_rma(int iter, enum ft_rma_opcodes rma_op, struct fid_ep *ep, size_t size) +{ + ssize_t ret; + + switch (rma_op) { + case FT_RMA_WRITE: + /* In this case, there will be no completion on the remote side. Instead, poll the recv buff. */ + ret = ft_rma_poll_buf(rx_buf, iter, size); + if (ret) + return ret; + break; + case FT_RMA_WRITEDATA: + /* In this case, a completion will be generated on the remote side, so wait for it. */ + ret = ft_get_rx_comp(rx_seq); + if (ret) + return ret; + break; + default: + FT_ERR("Unsupported RMA op type"); + return EXIT_FAILURE; + } + + if (ft_check_opts(FT_OPT_VERIFY_DATA | FT_OPT_ACTIVE)) { + ret = ft_check_buf((char *) rx_buf, size-1); + if (ret) + return ret; + } + + /* TODO: verify CQ data, if available */ + + if (rma_op == FT_RMA_WRITEDATA) { + if (fi->rx_attr->mode & FI_RX_CQ_DATA) { + /* In this mode, the next RDMA write op will consume a receive, so post one here. */ + ret = ft_post_rx(ep, 0, &rx_ctx); + } else { + /* Just increment seq counter */ + rx_seq++; + } + } + + return ret; +} + /* * Received messages match tagged buffers in order, but the completions can be * reported out of order. A tag is valid if it's within the current window. 
@@ -3565,6 +3645,41 @@ int ft_check_buf(void *buf, size_t size) return ret; } +int ft_rma_poll_buf(void *buf, int iter, size_t size) +{ + volatile char *recv_data; + struct timespec a, b; + + if (opts.iface != FI_HMEM_SYSTEM) { + /* Not supported */ + FT_ERR("FI_HMEM not supported for writedata latency test"); + return EXIT_FAILURE; + } else { + recv_data = (char *)buf + size - 1; + } + + if (timeout >= 0) + clock_gettime(CLOCK_MONOTONIC, &a); + + char expected_val = (char)iter; + while (*recv_data != expected_val) { + /* Although not expecting a completion, we must process the completion queue + * to make progress */ + ft_force_progress(); + + /* Check for timeout */ + if (timeout >= 0) { + clock_gettime(CLOCK_MONOTONIC, &b); + if ((b.tv_sec - a.tv_sec) > timeout) { + fprintf(stderr, "%ds timeout expired\n", timeout); + return -FI_ENODATA; + } + } + } + + return 0; +} + uint64_t ft_init_cq_data(struct fi_info *info) { if (info->domain_attr->cq_data_size >= sizeof(uint64_t)) { diff --git a/fabtests/include/shared.h b/fabtests/include/shared.h index 04a7e57246a..eb084441fd0 100644 --- a/fabtests/include/shared.h +++ b/fabtests/include/shared.h @@ -560,16 +560,19 @@ ssize_t ft_post_tx_buf(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, uint64_t data, void *ctx, void *op_buf, void *op_mr_desc, uint64_t op_tag); ssize_t ft_rx(struct fid_ep *ep, size_t size); +ssize_t ft_rx_rma(int iter, enum ft_rma_opcodes rma_op, struct fid_ep *ep, size_t size); ssize_t ft_tx(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx); +ssize_t ft_tx_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote, struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *ctx); ssize_t ft_post_inject_buf(struct fid_ep *ep, fi_addr_t fi_addr, size_t size, void *op_buf, uint64_t op_tag); ssize_t ft_post_inject(struct fid_ep *ep, fi_addr_t fi_addr, size_t size); ssize_t ft_inject(struct fid_ep *ep, fi_addr_t fi_addr, size_t size); +ssize_t ft_inject_rma(enum ft_rma_opcodes rma_op, struct 
fi_rma_iov *remote, struct fid_ep *ep, fi_addr_t fi_addr, size_t size); ssize_t ft_post_rma(enum ft_rma_opcodes op, char *buf, size_t size, struct fi_rma_iov *remote, void *context); ssize_t ft_post_rma_inject(enum ft_rma_opcodes op, char *buf, size_t size, struct fi_rma_iov *remote); - +int ft_rma_poll_buf(void *buf, int iter, size_t size); ssize_t ft_post_atomic(enum ft_atomic_opcodes opcode, struct fid_ep *ep, void *compare, void *compare_desc, void *result, diff --git a/fabtests/man/fabtests.7.md b/fabtests/man/fabtests.7.md index af4255d6915..b49e4e0a5f5 100644 --- a/fabtests/man/fabtests.7.md +++ b/fabtests/man/fabtests.7.md @@ -191,6 +191,9 @@ given provider or system may achieve. *fi_rma_bw* : An RMA read and write bandwidth test for reliable (MSG and RDM) endpoints. +*fi_rma_pingpong* +: An RMA write and writedata latency test for reliable (MSG and RDM) endpoints. + ## Unit These are simple one-sided unit tests that validate basic behavior of the API. diff --git a/fabtests/pytest/default/test_rma_pingpong.py b/fabtests/pytest/default/test_rma_pingpong.py new file mode 100644 index 00000000000..f2fef071fc1 --- /dev/null +++ b/fabtests/pytest/default/test_rma_pingpong.py @@ -0,0 +1,17 @@ +import pytest + +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +@pytest.mark.parametrize("endpoint_type", ["msg", "rdm"]) +@pytest.mark.parametrize("iteration_type", + [pytest.param("short", marks=pytest.mark.short), + pytest.param("standard", marks=pytest.mark.standard)]) +def test_rma_pingpong(cmdline_args, iteration_type, endpoint_type, operation_type, completion_semantic): + from common import ClientServerTest + + command = "fi_rma_pingpong" + command = command + " -e " + endpoint_type + command = command + " -o " + operation_type + test = ClientServerTest(cmdline_args, command, iteration_type, + completion_semantic=completion_semantic) + test.run() + diff --git a/fabtests/pytest/efa/test_rma_pingpong.py b/fabtests/pytest/efa/test_rma_pingpong.py 
new file mode 100644 index 00000000000..900166091a5 --- /dev/null +++ b/fabtests/pytest/efa/test_rma_pingpong.py @@ -0,0 +1,34 @@ +from efa.efa_common import efa_run_client_server_test +import pytest + + +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +@pytest.mark.parametrize("iteration_type", + [pytest.param("short", marks=pytest.mark.short), + pytest.param("standard", marks=pytest.mark.standard)]) +def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): + command = "fi_rma_pingpong -e rdm" + command = command + " -o " + operation_type + # rma_pingpong test with data verification takes longer to finish + timeout = max(540, cmdline_args.timeout) + efa_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) + + +@pytest.mark.functional +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, message_size, memory_type): + command = "fi_rma_pingpong -e rdm" + command = command + " -o " + operation_type + # rma_pingpong test with data verification takes longer to finish + timeout = max(540, cmdline_args.timeout) + efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, message_size, timeout=timeout) + + +@pytest.mark.functional +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +def test_rma_pingpong_range_no_inject(cmdline_args, operation_type, completion_semantic, inject_message_size): + command = "fi_rma_pingpong -e rdm -j 0" + command = command + " -o " + operation_type + # rma_pingpong test with data verification takes longer to finish + timeout = max(540, cmdline_args.timeout) + efa_run_client_server_test(cmdline_args, command, "short", completion_semantic, "host_to_host", inject_message_size, timeout=timeout) diff --git a/fabtests/pytest/shm/test_rma_pingpong.py 
b/fabtests/pytest/shm/test_rma_pingpong.py new file mode 100644 index 00000000000..abc9ae9678e --- /dev/null +++ b/fabtests/pytest/shm/test_rma_pingpong.py @@ -0,0 +1,23 @@ +import pytest +from shm.shm_common import shm_run_client_server_test + + +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +@pytest.mark.parametrize("iteration_type", + [pytest.param("short", marks=pytest.mark.short), + pytest.param("standard", marks=pytest.mark.standard)]) +def test_rma_pingpong(cmdline_args, iteration_type, operation_type, completion_semantic, memory_type): + command = "fi_rma_pingpong -e rdm" + command = command + " -o " + operation_type + # rma_pingpong test with data verification takes longer to finish + timeout = max(540, cmdline_args.timeout) + shm_run_client_server_test(cmdline_args, command, iteration_type, completion_semantic, memory_type, "all", timeout=timeout) + +@pytest.mark.functional +@pytest.mark.parametrize("operation_type", ["writedata", "write"]) +def test_rma_pingpong_range(cmdline_args, operation_type, completion_semantic, message_size, memory_type): + command = "fi_rma_pingpong -e rdm" + command = command + " -o " + operation_type + # rma_pingpong test with data verification takes longer to finish + timeout = max(540, cmdline_args.timeout) + shm_run_client_server_test(cmdline_args, command, "short", completion_semantic, memory_type, message_size, timeout=timeout)