From b7f823dc1a5899ec89450e28fd55042052be8719 Mon Sep 17 00:00:00 2001 From: CamStan Date: Mon, 19 Apr 2021 12:54:59 -0700 Subject: [PATCH 01/81] Fix: bootstrap space issue 618 This fixes the spacing issue that shows up at the end of the bootstrap script when printing out the information to install UnifyFS. --- bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap.sh b/bootstrap.sh index 0054d4070..a7e90ac0d 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -268,7 +268,7 @@ echo -n " export LD_LIBRARY_PATH=$INSTALL_DIR/lib:$INSTALL_DIR/lib64:" echo "\$LD_LIBRARY_PATH" echo " ./autogen.sh" -echo -n " ./configure --prefix=$INSTALL_DIR CPPFLAGS=-I$INSTALL_DIR/include" +echo -n " ./configure --prefix=$INSTALL_DIR CPPFLAGS=-I$INSTALL_DIR/include " echo "LDFLAGS=\"-L$INSTALL_DIR/lib -L$INSTALL_DIR/lib64\"" echo " make && make install" echo "" From ad7ff1028f670ae2fc98f3761e4a5f9bbace2f72 Mon Sep 17 00:00:00 2001 From: Jenna Delozier Date: Thu, 1 Apr 2021 11:23:39 -0400 Subject: [PATCH 02/81] Refactor staging functions Fix formatting Reformat changes --- util/unifyfs/src/unifyfs-rm.c | 100 +++++++++++++--------------------- 1 file changed, 39 insertions(+), 61 deletions(-) diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index eae354699..327840be0 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ b/util/unifyfs/src/unifyfs-rm.c @@ -774,6 +774,29 @@ static int invalid_stage(unifyfs_resource_t* resource, return -ENOSYS; } +static int generic_stage(char* cmd, int run_argc, unifyfs_args_t* args) +{ + size_t argc, stage_argc; + char** argv = NULL; + + stage_argc = construct_stage_argv(args, NULL); + + char* token = strtok(cmd, " "); + argc = 1 + run_argc + stage_argc; + argv = calloc(argc, sizeof(char*)); + + for (int i = 0; i < run_argc; i++) { + argv[i] = token; + token = strtok(NULL, " "); + } + + construct_stage_argv(args, argv + run_argc); + + execvp(argv[0], argv); + + return -errno; +} + /** * @brief Launch servers using IBM jsrun * @@ -871,35 +894,16 @@ static int jsrun_terminate(unifyfs_resource_t* resource, static int jsrun_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) { - size_t argc, jsrun_argc, stage_argc; - char** argv = NULL; - char n_nodes[16]; + size_t jsrun_argc = 13; + char cmd[200]; // full command: jsrun - jsrun_argc = 13; - snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); - - stage_argc = construct_stage_argv(args, NULL); + snprintf(cmd, sizeof(cmd), + "jsrun --immediate -e -individual --stdio_stderr unifyfs-stage.err.%%h.%%p --stdio_stdout unifyfs-stage.out.%%h.%%p --nrs %zu -r1 -c1 -a1", + resource->n_nodes); - // setup full command argv - argc = 1 + jsrun_argc + stage_argc; - argv = calloc(argc, sizeof(char*)); - argv[0] = strdup("jsrun"); - argv[1] = strdup("--immediate"); - argv[2] = strdup("-e"); - argv[3] = strdup("individual"); - argv[4] = strdup("--stdio_stderr"); - argv[5] = strdup("unifyfs-stage.err.%h.%p"); - argv[6] = strdup("--stdio_stdout"); - argv[7] = strdup("unifyfs-stage.out.%h.%p"); - argv[8] = strdup("--nrs"); - argv[9] = strdup(n_nodes); - argv[10] = strdup("-r1"); - argv[11] = strdup("-c1"); - argv[12] = strdup("-a1"); - construct_stage_argv(args, argv + jsrun_argc); + generic_stage(cmd, jsrun_argc, args); - execvp(argv[0], argv); perror("failed to execvp() mpirun to handle data stage"); return -errno; } @@ -988,28 +992,15 @@ static int mpirun_terminate(unifyfs_resource_t* resource, static int mpirun_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) { - size_t argc, 
mpirun_argc, stage_argc; - char** argv = NULL; - char n_nodes[16]; + size_t mpirun_argc = 5; + char cmd[200]; // full command: mpirun + snprintf(cmd, sizeof(cmd), "mpirun --np %zu --map-by ppr:1:node", + resource->n_nodes); - mpirun_argc = 5; - snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); - - stage_argc = construct_stage_argv(args, NULL); - - // setup full command argv - argc = 1 + mpirun_argc + stage_argc; - argv = calloc(argc, sizeof(char*)); - argv[0] = strdup("mpirun"); - argv[1] = strdup("-np"); - argv[2] = strdup(n_nodes); - argv[3] = strdup("--map-by"); - argv[4] = strdup("ppr:1:node"); - construct_stage_argv(args, argv + mpirun_argc); + generic_stage(cmd, mpirun_argc, args); - execvp(argv[0], argv); perror("failed to execvp() mpirun to handle data stage"); return -errno; } @@ -1098,28 +1089,15 @@ static int srun_terminate(unifyfs_resource_t* resource, static int srun_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) { - size_t argc, srun_argc, stage_argc; - char** argv = NULL; - char n_nodes[16]; + size_t srun_argc = 5; + char cmd[200]; // full command: srun + snprintf(cmd, sizeof(cmd), "srun -N %zu --ntasks-per-node 1", + resource->n_nodes); - srun_argc = 5; - snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); - - stage_argc = construct_stage_argv(args, NULL); + generic_stage(cmd, srun_argc, args); - // setup full command argv - argc = 1 + srun_argc + stage_argc; - argv = calloc(argc, sizeof(char*)); - argv[0] = strdup("srun"); - argv[1] = strdup("-N"); - argv[2] = strdup(n_nodes); - argv[3] = strdup("--ntasks-per-node"); - argv[4] = strdup("1"); - construct_stage_argv(args, argv + srun_argc); - - execvp(argv[0], argv); perror("failed to execvp() srun to launch unifyfsd"); return -errno; } From 163f2c0045b5300f827d9577c9229d86956148ba Mon Sep 17 00:00:00 2001 From: CamStan Date: Tue, 4 May 2021 18:37:51 -0700 Subject: [PATCH 03/81] Add additional compilers for Gitlab CI testing This adds additional Gitlab CI jobs for available newer versions of gcc on Lassen, Ascent, and Catalyst. Also adds jobs for xl and icc, but currently leaving them off until some issues are resolved when building with these compilers. 
--- .gitlab/ascent.yml | 59 ++++++++++++++++++++++++++++++++++++++++++++ .gitlab/catalyst.yml | 51 ++++++++++++++++++++++++++++++++++++++ .gitlab/lassen.yml | 51 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+) diff --git a/.gitlab/ascent.yml b/.gitlab/ascent.yml index d997f6f9b..b961cc741 100644 --- a/.gitlab/ascent.yml +++ b/.gitlab/ascent.yml @@ -26,6 +26,7 @@ ##### All Ascent Jobs ##### +### gcc@4.8.5 ### ascent-gcc-4_8_5-init: extends: [.ascent-shell-template, .init-template] @@ -52,3 +53,61 @@ ascent-gcc-4_8_5-integ-test: FC_COMMAND: "which gfortran" extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] needs: ["ascent-gcc-4_8_5-build"] + + +### gcc@10.2.0 ### +ascent-gcc-10_2_0-init: + extends: [.ascent-shell-template, .init-template] + +ascent-gcc-10_2_0-build: + variables: + COMPILER: gcc/10.2.0 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.ascent-shell-template, .build-template] + needs: ["ascent-gcc-10_2_0-init"] + +ascent-gcc-10_2_0-unit-test: + variables: + COMPILER: gcc/10.2.0 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] + needs: ["ascent-gcc-10_2_0-build"] + +ascent-gcc-10_2_0-integ-test: + variables: + COMPILER: gcc/10.2.0 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-build"] + + +### xl@16.1 ### +#ascent-xl-16_1-init: +# extends: [.ascent-shell-template, .init-template] +# +#ascent-xl-16_1-build: +# variables: +# COMPILER: xl/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.ascent-shell-template, .build-template] +# needs: ["ascent-xl-16_1-init"] +# +#ascent-xl-16_1-unit-test: +# variables: +# COMPILER: xl/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] +# needs: ["ascent-xl-16_1-build"] +# +#ascent-xl-16_1-integ-test: +# variables: +# COMPILER: xlc/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] +# needs: ["ascent-xl-16_1-build"] diff --git a/.gitlab/catalyst.yml b/.gitlab/catalyst.yml index f3533548c..18b96159d 100644 --- a/.gitlab/catalyst.yml +++ b/.gitlab/catalyst.yml @@ -23,6 +23,7 @@ ##### All Catalyst Jobs ##### +### gcc@4.9.3 ### catalyst-gcc-4_9_3-build: variables: COMPILER: gcc/4.9.3 @@ -45,3 +46,53 @@ catalyst-gcc-4_9_3-integ-test: FC_COMMAND: "which gfortran" extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] needs: ["catalyst-gcc-4_9_3-build"] + + +### gcc@10.2.1 ### +catalyst-gcc-10_2_1-build: + variables: + COMPILER: gcc/10.2.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.catalyst-shell-template, .build-template] + +catalyst-gcc-10_2_1-unit-test: + variables: + COMPILER: gcc/10.2.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +catalyst-gcc-10_2_1-integ-test: + variables: + COMPILER: gcc/10.2.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + + +### intel@19.0.4 ### +#catalyst-intel-19_0_4-build: +# variables: +# 
COMPILER: intel/19.0.4 +# CC_COMMAND: "which icc" +# FC_COMMAND: "which ifort" +# extends: [.catalyst-shell-template, .build-template] +# +#catalyst-intel-19_0_4-unit-test: +# variables: +# COMPILER: intel/19.0.4 +# CC_COMMAND: "which icc" +# FC_COMMAND: "which ifort" +# extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] +# needs: ["catalyst-intel-19_0_4-build"] +# +#catalyst-intel-19_0_4-integ-test: +# variables: +# COMPILER: intel/19.0.4 +# CC_COMMAND: "which icc" +# FC_COMMAND: "which ifort" +# extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] +# needs: ["catalyst-intel-19_0_4-build"] diff --git a/.gitlab/lassen.yml b/.gitlab/lassen.yml index a34a6f587..a4b9a79b9 100644 --- a/.gitlab/lassen.yml +++ b/.gitlab/lassen.yml @@ -23,6 +23,7 @@ ##### All Lassen Jobs ##### +### gcc@4.9.3 ### lassen-gcc-4_9_3-build: variables: COMPILER: gcc/4.9.3 @@ -45,3 +46,53 @@ lassen-gcc-4_9_3-integ-test: FC_COMMAND: "which gfortran" extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] needs: ["lassen-gcc-4_9_3-build"] + + +### gcc@8.3.1 ### +lassen-gcc-8_3_1-build: + variables: + COMPILER: gcc/8.3.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lassen-shell-template, .build-template] + +lassen-gcc-8_3_1-unit-test: + variables: + COMPILER: gcc/8.3.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +lassen-gcc-8_3_1-integ-test: + variables: + COMPILER: gcc/8.3.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-build"] + + +### xl@16.1 ### +#lassen-xl-16_1-build: +# variables: +# COMPILER: xl/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.lassen-shell-template, .build-template] +# +#lassen-xl-16_1-unit-test: +# variables: +# COMPILER: xl/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] +# needs: ["lassen-xl-16_1-build"] +# +#lassen-xl-16_1-integ-test: +# variables: +# COMPILER: xlc/16.1 +# CC_COMMAND: "which xlc" +# FC_COMMAND: "which xlf90" +# extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] +# needs: ["lassen-xl-16_1-build"] From a2c8d6c02365544362ae0a2618933484e1e6728c Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Tue, 28 Apr 2020 12:09:15 -0400 Subject: [PATCH 04/81] initial client API library implementation * implements the alpha version of the client API and library. * includes new unit tests to exercise the library. 
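
For illustration, a minimal usage sketch of the new library entry points (not part of this patch): it assumes a placeholder mountpoint "/unifyfs" and default options, and exercises only unifyfs_initialize() and unifyfs_finalize(), whose signatures appear in unifyfs_api.c below; the include path for unifyfs_api.h may differ depending on install prefix.

    #include <stdio.h>
    #include "unifyfs_api.h"

    int main(void)
    {
        unifyfs_handle fshdl = UNIFYFS_INVALID_HANDLE;

        /* connect this process to UnifyFS for the given mountpoint,
         * using no extra configuration options */
        unifyfs_rc rc = unifyfs_initialize("/unifyfs", NULL, 0, &fshdl);
        if (rc != UNIFYFS_SUCCESS) {
            fprintf(stderr, "unifyfs_initialize() failed (rc=%d)\n", (int)rc);
            return 1;
        }

        /* ... file create/open/write/read calls from the new API go here ... */

        /* detach from the servers and release client resources */
        rc = unifyfs_finalize(fshdl);
        if (rc != UNIFYFS_SUCCESS) {
            fprintf(stderr, "unifyfs_finalize() failed (rc=%d)\n", (int)rc);
            return 1;
        }
        return 0;
    }
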
Also includes: * fix for missing req completion notice for hole extent * fix for write after laminate * eliminate shadow copy of file attributes in client metadata * allow symbolic file access permissions in checkpatch * fix unifyfs-stage unit tests * fix to clean up logio shmem regions on server exit TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.c" TEST_CHECKPATCH_SKIP_FILES+=",t/0700-unifyfs-stage-full.t" TEST_CHECKPATCH_SKIP_FILES+=",t/9300-unifyfs-stage-isolated.t" --- .travis.yml | 1 + client/src/Makefile.am | 54 +- client/src/client_read.c | 8 + client/src/client_transfer.c | 344 +++++++++++++ client/src/client_transfer.h | 36 ++ client/src/gotcha_map_unifyfs_list.c | 2 + client/src/margo_client.c | 8 +- client/src/margo_client.h | 10 +- client/src/unifyfs-dirops.c | 6 +- client/src/unifyfs-fixed.c | 12 +- client/src/unifyfs-fixed.h | 2 +- client/src/unifyfs-internal.h | 41 +- client/src/unifyfs-stdio.h | 3 - client/src/unifyfs-sysio.c | 10 +- client/src/unifyfs-sysio.h | 2 + client/src/unifyfs.c | 476 ++++-------------- client/src/unifyfs_api.c | 204 ++++++++ client/src/unifyfs_api.h | 341 +++++++++++++ client/src/unifyfs_api_file.c | 200 ++++++++ client/src/unifyfs_api_internal.h | 37 ++ client/src/unifyfs_api_io.c | 403 +++++++++++++++ client/src/unifyfs_api_transfer.c | 144 ++++++ common/src/unifyfs_configurator.c | 379 +++++++++----- common/src/unifyfs_configurator.h | 116 +++-- common/src/unifyfs_logio.c | 7 +- common/src/unifyfs_logio.h | 6 +- common/src/unifyfs_meta.h | 2 +- docs/configuration.rst | 32 +- docs/index.rst | 1 + docs/library_api.rst | 360 +++++++++++++ scripts/checkpatch.sh | 3 +- server/src/unifyfs_fops_mdhim.c | 10 +- server/src/unifyfs_fops_rpc.c | 5 +- server/src/unifyfs_request_manager.c | 3 + server/src/unifyfs_server.c | 2 +- t/0700-unifyfs-stage-full.t | 53 +- t/8000-client-api.t | 8 + t/9300-unifyfs-stage-isolated.t | 52 +- t/Makefile.am | 28 +- t/api/client_api_suite.c | 72 +++ t/api/client_api_suite.h | 56 +++ t/api/create-open-remove.c | 99 ++++ t/api/init-fini.c | 47 ++ t/api/laminate.c | 193 +++++++ t/api/write-read-sync-stat.c | 392 +++++++++++++++ t/lib/testutil.c | 92 ++++ t/lib/testutil.h | 19 + t/sharness.d/00-test-env.sh | 3 + t/sharness.d/01-unifyfs-settings.sh | 4 +- .../src/unifyfs-stage-transfer.c | 25 +- 50 files changed, 3698 insertions(+), 715 deletions(-) create mode 100644 client/src/client_transfer.c create mode 100644 client/src/client_transfer.h create mode 100644 client/src/unifyfs_api.c create mode 100644 client/src/unifyfs_api.h create mode 100644 client/src/unifyfs_api_file.c create mode 100644 client/src/unifyfs_api_internal.h create mode 100644 client/src/unifyfs_api_io.c create mode 100644 client/src/unifyfs_api_transfer.c create mode 100644 docs/library_api.rst create mode 100755 t/8000-client-api.t create mode 100644 t/api/client_api_suite.c create mode 100644 t/api/client_api_suite.h create mode 100644 t/api/create-open-remove.c create mode 100644 t/api/init-fini.c create mode 100644 t/api/laminate.c create mode 100644 t/api/write-read-sync-stat.c diff --git a/.travis.yml b/.travis.yml index 517765347..87d1e68aa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,6 +10,7 @@ addons: - automake - build-essential - cmake + - diffutils - gfortran - libhdf5-openmpi-dev - libopenmpi-dev diff --git a/client/src/Makefile.am b/client/src/Makefile.am index 83c645584..2ff8951b1 100644 --- a/client/src/Makefile.am +++ b/client/src/Makefile.am @@ -1,6 +1,7 @@ include $(top_srcdir)/common/src/Makefile.mk -lib_LTLIBRARIES = 
libunifyfs.la +lib_LTLIBRARIES = libunifyfs.la libunifyfs_api.la + libunifyfsdir = $(includedir) if HAVE_GOTCHA @@ -15,6 +16,7 @@ endif #HAVE_GOTCHA AM_CFLAGS = -Wall -Wno-strict-aliasing -Werror include_HEADERS = unifyfs.h $(UNIFYFS_COMMON_INSTALL_HDRS) +pkginclude_HEADERS = unifyfs_api.h if HAVE_GOTCHA if HAVE_FORTRAN @@ -24,6 +26,7 @@ endif #HAVE_GOTCHA CLIENT_COMMON_CPPFLAGS = \ -I$(top_builddir)/client \ + -I$(top_srcdir)/client/include \ -I$(top_srcdir)/common/src CLIENT_COMMON_CFLAGS = \ @@ -39,33 +42,55 @@ CLIENT_COMMON_LIBADD = \ $(UNIFYFS_COMMON_LIBS) \ -lm -lrt -lcrypto -lpthread -CLIENT_COMMON_SOURCES = \ +CLIENT_API_SRC_FILES = \ + unifyfs_api.h \ + unifyfs_api_internal.h \ + unifyfs_api.c \ + unifyfs_api_file.c \ + unifyfs_api_io.c \ + unifyfs_api_transfer.c + +CLIENT_CORE_SRC_FILES = \ + $(OPT_SRCS) \ $(UNIFYFS_COMMON_SRCS) \ client_read.c \ client_read.h \ + client_transfer.c \ + client_transfer.h \ margo_client.c \ margo_client.h \ unifyfs.c \ unifyfs.h \ - unifyfs-dirops.h \ - unifyfs-dirops.c \ unifyfs-fixed.c \ unifyfs-fixed.h \ unifyfs-internal.h \ - unifyfs-stdio.c \ - unifyfs-stdio.h \ - unifyfs-sysio.c \ - unifyfs-sysio.h \ uthash.h \ utlist.h if USE_PMPI_WRAPPERS -CLIENT_COMMON_SOURCES += \ - pmpi_wrappers.c \ - pmpi_wrappers.h +CLIENT_CORE_SRC_FILES += pmpi_wrappers.c pmpi_wrappers.h endif -libunifyfs_la_SOURCES = $(CLIENT_COMMON_SOURCES) +CLIENT_POSIX_SRC_FILES = \ + unifyfs-dirops.c \ + unifyfs-dirops.h \ + unifyfs-stdio.c \ + unifyfs-stdio.h \ + unifyfs-sysio.c \ + unifyfs-sysio.h + +libunifyfs_api_la_SOURCES = \ + $(CLIENT_API_SRC_FILES) \ + $(CLIENT_CORE_SRC_FILES) \ + $(CLIENT_POSIX_SRC_FILES) +libunifyfs_api_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) +libunifyfs_api_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) +libunifyfs_api_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) +libunifyfs_api_la_LIBADD = $(CLIENT_COMMON_LIBADD) + +libunifyfs_la_SOURCES = \ + $(CLIENT_CORE_SRC_FILES) \ + $(CLIENT_POSIX_SRC_FILES) libunifyfs_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfs_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) @@ -73,7 +98,10 @@ libunifyfs_la_LIBADD = $(CLIENT_COMMON_LIBADD) if HAVE_GOTCHA -libunifyfs_gotcha_la_SOURCES = $(CLIENT_COMMON_SOURCES) gotcha_map_unifyfs_list.c +libunifyfs_gotcha_la_SOURCES = \ + $(CLIENT_CORE_SRC_FILES) \ + $(CLIENT_POSIX_SRC_FILES) \ + gotcha_map_unifyfs_list.c libunifyfs_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) -DUNIFYFS_GOTCHA libunifyfs_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) $(GOTCHA_CFLAGS) libunifyfs_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) $(GOTCHA_LDFLAGS) diff --git a/client/src/client_read.c b/client/src/client_read.c index 46ed83843..3166e8ed2 100644 --- a/client/src/client_read.c +++ b/client/src/client_read.c @@ -477,6 +477,14 @@ int compare_read_req(const void* a, const void* b) */ int process_gfid_reads(read_req_t* in_reqs, int in_count) { + if (0 == in_count) { + return UNIFYFS_SUCCESS; + } + + if (NULL == in_reqs) { + return EINVAL; + } + int i, rc, read_rc; /* assume we'll succeed */ diff --git a/client/src/client_transfer.c b/client/src/client_transfer.c new file mode 100644 index 000000000..b8a6b3072 --- /dev/null +++ b/client/src/client_transfer.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. 
+ * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "client_transfer.h" +#include "unifyfs-sysio.h" + +static +int do_transfer_data(int fd_src, + int fd_dst, + off_t offset, + size_t count) +{ + int ret = UNIFYFS_SUCCESS; + int err; + off_t pos = 0; + ssize_t n_written = 0; + ssize_t n_read = 0; + ssize_t n_processed = 0; + size_t len = UNIFYFS_TX_BUFSIZE; + char* buf = NULL; + + buf = malloc(UNIFYFS_TX_BUFSIZE); + if (NULL == buf) { + LOGERR("failed to allocate transfer buffer"); + return ENOMEM; + } + + errno = 0; + pos = UNIFYFS_WRAP(lseek)(fd_src, offset, SEEK_SET); + err = errno; + if (pos == (off_t) -1) { + LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); + ret = err; + goto out; + } + + errno = 0; + pos = UNIFYFS_WRAP(lseek)(fd_dst, offset, SEEK_SET); + err = errno; + if (pos == (off_t) -1) { + LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); + ret = err; + goto out; + } + + while (count > n_processed) { + if (len > count) { + len = count; + } + + errno = 0; + n_read = UNIFYFS_WRAP(read)(fd_src, buf, len); + err = errno; + if (n_read == 0) { /* EOF */ + break; + } else if (n_read < 0) { /* error */ + ret = err; + goto out; + } + + do { + errno = 0; + n_written = UNIFYFS_WRAP(write)(fd_dst, buf, n_read); + err = errno; + if (n_written < 0) { + ret = err; + goto out; + } else if ((n_written == 0) && err && (err != EAGAIN)) { + ret = err; + goto out; + } + + n_read -= n_written; + n_processed += n_written; + } while (n_read > 0); + } + +out: + if (NULL != buf) { + free(buf); + } + + return ret; +} + +int do_transfer_file_serial(const char* src, + const char* dst, + struct stat* sb_src, + int direction) +{ + /* NOTE: we currently do not use the @direction */ + + int err; + int ret = UNIFYFS_SUCCESS; + int fd_src = 0; + int fd_dst = 0; + + errno = 0; + fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); + err = errno; + if (fd_src < 0) { + LOGERR("failed to open() source file %s", src); + return err; + } + + errno = 0; + fd_dst = UNIFYFS_WRAP(open)(dst, O_WRONLY); + err = errno; + if (fd_dst < 0) { + LOGERR("failed to open() destination file %s", dst); + close(fd_src); + return err; + } + + LOGDBG("serial transfer (rank=%d of %d): length=%zu", + client_rank, global_rank_cnt, sb_src->st_size); + + ret = do_transfer_data(fd_src, fd_dst, 0, sb_src->st_size); + if (UNIFYFS_SUCCESS != ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } else { + UNIFYFS_WRAP(fsync)(fd_dst); + } + + UNIFYFS_WRAP(close)(fd_dst); + UNIFYFS_WRAP(close)(fd_src); + + return ret; +} + +int do_transfer_file_parallel(const char* src, + const char* dst, + struct stat* sb_src, + int direction) +{ + /* NOTE: we currently do not use the @direction */ + + int err; + int ret = UNIFYFS_SUCCESS; + int fd_src = 0; + int fd_dst = 0; + uint64_t total_chunks = 0; + uint64_t chunk_start = 0; + uint64_t n_chunks_remainder = 0; + uint64_t n_chunks_per_rank = 0; + uint64_t offset = 0; + uint64_t len = 0; + uint64_t size = sb_src->st_size; + uint64_t last_chunk_size = 0; + + /* calculate total number of chunk transfers */ + total_chunks = size / UNIFYFS_TX_BUFSIZE; + last_chunk_size = size % UNIFYFS_TX_BUFSIZE; + if (last_chunk_size) { + total_chunks++; + } + + /* calculate chunks per rank */ + n_chunks_per_rank = total_chunks / global_rank_cnt; + n_chunks_remainder = total_chunks % global_rank_cnt; + + /* + * if the file is smaller than (rank_count * transfer_size), just + * use the 
serial mode. + * + * FIXME: is this assumption fair even for the large rank count? + */ + if (total_chunks <= (uint64_t)global_rank_cnt) { + if (client_rank == 0) { + LOGDBG("using serial transfer for small file"); + ret = do_transfer_file_serial(src, dst, sb_src, direction); + if (ret) { + LOGERR("do_transfer_file_serial() failed"); + } + } else { + ret = UNIFYFS_SUCCESS; + } + return ret; + } + + errno = 0; + fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); + err = errno; + if (fd_src < 0) { + LOGERR("failed to open() source file %s", src); + return err; + } + + errno = 0; + fd_dst = UNIFYFS_WRAP(open)(dst, O_WRONLY); + err = errno; + if (fd_dst < 0) { + LOGERR("failed to open() destination file %s", dst); + UNIFYFS_WRAP(close)(fd_src); + return err; + } + + chunk_start = n_chunks_per_rank * client_rank; + offset = chunk_start * UNIFYFS_TX_BUFSIZE; + len = n_chunks_per_rank * UNIFYFS_TX_BUFSIZE; + + LOGDBG("parallel transfer (rank=%d of %d): " + "#chunks=%zu, offset=%zu, length=%zu", + client_rank, global_rank_cnt, + (size_t)n_chunks_per_rank, (size_t)offset, (size_t)len); + + ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); + if (ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } else { + if (n_chunks_remainder && (client_rank < n_chunks_remainder)) { + /* do single chunk transfer per rank of remainder portion */ + len = UNIFYFS_TX_BUFSIZE; + if (last_chunk_size && (client_rank == (n_chunks_remainder - 1))) { + len = last_chunk_size; + } + chunk_start = (total_chunks - n_chunks_remainder) + client_rank; + offset = chunk_start * UNIFYFS_TX_BUFSIZE; + + LOGDBG("parallel transfer (rank=%d of %d): " + "#chunks=1, offset=%zu, length=%zu", + client_rank, global_rank_cnt, + (size_t)offset, (size_t)len); + ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); + if (ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } + } + fsync(fd_dst); + } + + UNIFYFS_WRAP(close)(fd_dst); + UNIFYFS_WRAP(close)(fd_src); + + return ret; +} + +int unifyfs_transfer_file(const char* src, + const char* dst, + int parallel) +{ + int rc, err; + int ret = 0; + int txdir = 0; + struct stat sb_src = { 0, }; + mode_t mode_no_write; + struct stat sb_dst = { 0, }; + int unify_src = 0; + int unify_dst = 0; + + char* src_path = strdup(src); + if (NULL == src_path) { + return -ENOMEM; + } + + char src_upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(src, src_upath)) { + txdir = UNIFYFS_TX_STAGE_OUT; + unify_src = 1; + } + + errno = 0; + rc = UNIFYFS_WRAP(stat)(src, &sb_src); + err = errno; + if (rc < 0) { + return -err; + } + + char dst_path[UNIFYFS_MAX_FILENAME] = { 0, }; + char* pos = dst_path; + pos += sprintf(pos, "%s", dst); + + errno = 0; + rc = UNIFYFS_WRAP(stat)(dst, &sb_dst); + err = errno; + if (rc == 0 && S_ISDIR(sb_dst.st_mode)) { + /* if the given destination path is a directory, append the + * basename of the source file */ + sprintf(pos, "/%s", basename((char*) src_path)); + } + + char dst_upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(dst_path, dst_upath)) { + txdir = UNIFYFS_TX_STAGE_IN; + unify_dst = 1; + } + + if (unify_src + unify_dst != 1) { + // we may fail the operation with EINVAL, but useful for testing + LOGDBG("WARNING: none of pathnames points to unifyfs volume"); + } + + /* for both serial and parallel transfers, use rank 0 client to + * create the destination file using the source file's mode*/ + if (0 == client_rank) { + errno = 0; + int create_flags 
= O_CREAT | O_WRONLY | O_TRUNC; + int fd = UNIFYFS_WRAP(open)(dst_path, create_flags, sb_src.st_mode); + err = errno; + if (fd < 0) { + LOGERR("failed to create destination file %s", dst); + return -err; + } + close(fd); + } + + if (parallel) { + rc = do_transfer_file_parallel(src_path, dst_path, &sb_src, txdir); + } else { + rc = do_transfer_file_serial(src_path, dst_path, &sb_src, txdir); + } + + if (rc != UNIFYFS_SUCCESS) { + ret = -unifyfs_rc_errno(rc); + } else { + ret = 0; + + /* If the destination file is in UnifyFS, then laminate it so that it + * will be readable by other clients. */ + if (unify_dst) { + /* remove the write bits from the source file's mode bits to set + * the new file mode. use chmod with the new mode to ask for file + * lamination. */ + mode_no_write = (sb_src.st_mode) & ~(0222); + UNIFYFS_WRAP(chmod)(dst_path, mode_no_write); + } + } + + return ret; +} diff --git a/client/src/client_transfer.h b/client/src/client_transfer.h new file mode 100644 index 000000000..09f5d0422 --- /dev/null +++ b/client/src/client_transfer.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs-internal.h" + +/* client transfer (stage-in/out) support */ + +#define UNIFYFS_TX_BUFSIZE (8*(1<<20)) + +enum { + UNIFYFS_TX_STAGE_OUT = 0, + UNIFYFS_TX_STAGE_IN = 1, + UNIFYFS_TX_SERIAL = 0, + UNIFYFS_TX_PARALLEL = 1, +}; + +int do_transfer_file_serial(const char* src, + const char* dst, + struct stat* sb_src, + int direction); + +int do_transfer_file_parallel(const char* src, + const char* dst, + struct stat* sb_src, + int direction); diff --git a/client/src/gotcha_map_unifyfs_list.c b/client/src/gotcha_map_unifyfs_list.c index 01e503f05..690f0be86 100644 --- a/client/src/gotcha_map_unifyfs_list.c +++ b/client/src/gotcha_map_unifyfs_list.c @@ -13,6 +13,8 @@ */ #include "unifyfs-internal.h" +#include "unifyfs-stdio.h" +#include "unifyfs-sysio.h" #include /* define gotcha-specific state to use with our wrapper */ diff --git a/client/src/margo_client.c b/client/src/margo_client.c index a85590b57..a6caa9d30 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -199,7 +199,7 @@ static hg_handle_t create_handle(hg_id_t id) } /* invokes the attach rpc function */ -int invoke_client_attach_rpc(void) +int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -211,7 +211,7 @@ int invoke_client_attach_rpc(void) /* fill in input struct */ unifyfs_attach_in_t in; - fill_client_attach_info(&in); + fill_client_attach_info(clnt_cfg, &in); /* call rpc function */ LOGDBG("invoking the attach rpc function in client"); @@ -245,7 +245,7 @@ int invoke_client_attach_rpc(void) } /* invokes the mount rpc function */ -int invoke_client_mount_rpc(void) +int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -257,7 +257,7 @@ int invoke_client_mount_rpc(void) /* fill in input struct */ unifyfs_mount_in_t in; - fill_client_mount_info(&in); + fill_client_mount_info(clnt_cfg, &in); /* pass our margo address to the server */ in.client_addr_str = 
strdup(client_rpc_context->client_addr_str); diff --git a/client/src/margo_client.h b/client/src/margo_client.h index 72b3d44cd..b6f6469bf 100644 --- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -52,11 +52,13 @@ int unifyfs_client_rpc_init(void); int unifyfs_client_rpc_finalize(void); -void fill_client_attach_info(unifyfs_attach_in_t* in); -int invoke_client_attach_rpc(void); +void fill_client_attach_info(unifyfs_cfg_t* clnt_cfg, + unifyfs_attach_in_t* in); +int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg); -void fill_client_mount_info(unifyfs_mount_in_t* in); -int invoke_client_mount_rpc(void); +void fill_client_mount_info(unifyfs_cfg_t* clnt_cfg, + unifyfs_mount_in_t* in); +int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg); int invoke_client_unmount_rpc(void); diff --git a/client/src/unifyfs-dirops.c b/client/src/unifyfs-dirops.c index 2d0b06946..6c2f29a34 100644 --- a/client/src/unifyfs-dirops.c +++ b/client/src/unifyfs-dirops.c @@ -141,10 +141,12 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) meta = unifyfs_get_meta_from_fid(fid); assert(meta != NULL); - meta->mode = (meta->mode & ~S_IFREG) | S_IFDIR; /* set as directory */ + + /* set as directory */ + meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; } - meta->global_size = sb.st_size; + meta->attrs.size = sb.st_size; unifyfs_dirstream_t* dirp = unifyfs_dirstream_alloc(fid); diff --git a/client/src/unifyfs-fixed.c b/client/src/unifyfs-fixed.c index 76003358a..abb9ec01f 100644 --- a/client/src/unifyfs-fixed.c +++ b/client/src/unifyfs-fixed.c @@ -84,7 +84,7 @@ static int add_write_meta_to_index(unifyfs_filemeta_t* meta, if (count_before >= (unifyfs_max_index_entries - 2)) { /* this will flush our segments, sync them, and set the running * segment count back to 0 */ - unifyfs_sync(meta->fid); + unifyfs_sync_extents(meta->fid); } /* store the write in our segment tree used for syncing with server. */ @@ -124,7 +124,7 @@ off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta) /* record maximum write log offset */ off_t max_log_offset = 0; - int gfid = meta->gfid; + int gfid = meta->attrs.gfid; seg_tree_rdlock(&meta->extents_sync); /* For each write in this file's seg_tree ... */ @@ -189,7 +189,7 @@ int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz) * * Returns 0 on success, nonzero otherwise. 
*/ -int unifyfs_sync(int target_fid) +int unifyfs_sync_extents(int target_fid) { int tmp_rc; int ret = UNIFYFS_SUCCESS; @@ -232,11 +232,11 @@ int unifyfs_sync(int target_fid) } /* tell the server to grab our new extents */ - tmp_rc = invoke_client_sync_rpc(meta->gfid); + tmp_rc = invoke_client_sync_rpc(meta->attrs.gfid); if (UNIFYFS_SUCCESS != tmp_rc) { /* something went wrong when trying to flush extents */ LOGERR("failed to flush write index to server for gfid=%d", - meta->gfid); + meta->attrs.gfid); ret = tmp_rc; } @@ -262,7 +262,7 @@ int unifyfs_sync(int target_fid) } /* got an open file, sync this file id */ - tmp_rc = unifyfs_sync(fid); + tmp_rc = unifyfs_sync_extents(fid); if (UNIFYFS_SUCCESS != tmp_rc) { ret = tmp_rc; } diff --git a/client/src/unifyfs-fixed.h b/client/src/unifyfs-fixed.h index 8053e0e7c..288ff923c 100644 --- a/client/src/unifyfs-fixed.h +++ b/client/src/unifyfs-fixed.h @@ -52,7 +52,7 @@ off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta); int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz); /* sync all writes for target file(s) with the server */ -int unifyfs_sync(int target_fid); +int unifyfs_sync_extents(int target_fid); /* write data to file using log-based I/O */ int unifyfs_fid_logio_write( diff --git a/client/src/unifyfs-internal.h b/client/src/unifyfs-internal.h index 8789bde38..0344e6eb9 100644 --- a/client/src/unifyfs-internal.h +++ b/client/src/unifyfs-internal.h @@ -275,23 +275,18 @@ enum unifyfs_file_storage { }; typedef struct { - off_t global_size; /* Global size of the file */ + int fid; /* local file index in filemetas array */ + int storage; /* FILE_STORAGE type */ + pthread_spinlock_t fspinlock; /* file lock variable */ enum flock_enum flock_status; /* file lock status */ - int storage; /* FILE_STORAGE type */ - - int fid; /* local file index in filemetas array */ - int gfid; /* global file id for this file */ int needs_sync; /* have unsynced writes */ - - int is_laminated; /* Is this file laminated */ - uint32_t mode; /* st_mode bits. This has file - * permission info and will tell you if this - * is a regular file or directory. 
*/ struct seg_tree extents_sync; /* Segment tree containing our coalesced * writes between sync operations */ struct seg_tree extents; /* Segment tree of all local data extents */ + + unifyfs_file_attr_t attrs; /* UnifyFS and POSIX file attributes */ } unifyfs_filemeta_t; /* struct used to map a full path to its local file id, @@ -346,15 +341,21 @@ typedef struct { * Global variable declarations * ------------------------------- */ + + +extern int global_rank_cnt; /* count of world ranks */ +extern int client_rank; /* client-provided rank (for debugging) */ + +extern int unifyfs_mounted; /* avoid duplicate mounts (for now) */ +extern int unifyfs_app_id; /* application (aka mountpoint) id */ +extern int unifyfs_client_id; /* client id within application */ + extern unifyfs_index_buf_t unifyfs_indices; extern unsigned long unifyfs_max_index_entries; /* log-based I/O context */ extern logio_context* logio_ctx; -extern int unifyfs_app_id; -extern int unifyfs_client_id; - /* whether to return UNIFYFS (true) or TMPFS (false) magic value from statfs */ extern bool unifyfs_super_magic; @@ -406,6 +407,9 @@ extern bool unifyfs_local_extents; /* enable tracking of local extents */ * Common functions * ------------------------------- */ +int unifyfs_init(unifyfs_cfg_t* clnt_cfg); +int unifyfs_fini(void); + /* single function to route all unsupported wrapper calls through */ int unifyfs_unsupported(const char* fn_name, const char* file, int line, const char* fmt, ...); @@ -549,9 +553,12 @@ int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, int unifyfs_fid_close(int fid); -/* delete a file id and return file its resources to free pools */ +/* unlink file and then delete its associated state */ int unifyfs_fid_unlink(int fid); +/* delete a file id, free its local storage resources, and return + * the file id to free stack */ +int unifyfs_fid_delete(int fid); /* global metadata functions */ @@ -565,10 +572,4 @@ int unifyfs_set_global_file_meta(int gfid, int unifyfs_get_global_file_meta(int gfid, unifyfs_file_attr_t* gfattr); -// These require types/structures defined above -#include "unifyfs-fixed.h" -#include "unifyfs-stdio.h" -#include "unifyfs-sysio.h" -#include "unifyfs-dirops.h" - #endif /* UNIFYFS_INTERNAL_H */ diff --git a/client/src/unifyfs-stdio.h b/client/src/unifyfs-stdio.h index e8bd68608..d653ddd42 100644 --- a/client/src/unifyfs-stdio.h +++ b/client/src/unifyfs-stdio.h @@ -104,7 +104,4 @@ UNIFYFS_DECL(getwc, wint_t, (FILE* stream)); UNIFYFS_DECL(putwc, wint_t, (wchar_t c, FILE* stream)); UNIFYFS_DECL(ungetwc, wint_t, (wint_t c, FILE* stream)); -UNIFYFS_DECL(chmod, int, (const char* path, mode_t mode)); -UNIFYFS_DECL(fchmod, int, (int fd, mode_t mode)); - #endif /* UNIFYFS_STDIO_H */ diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 4bdc63269..8d93f7429 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -2142,7 +2142,7 @@ static int __chmod(int fid, mode_t mode) } /* Once a file is laminated, you can't modify it in any way */ - if (meta->is_laminated) { + if (meta->attrs.is_laminated) { LOGDBG("%s is already laminated", path); errno = EROFS; return -1; @@ -2158,8 +2158,8 @@ static int __chmod(int fid, mode_t mode) * meta->mode & 0222 Was at least one write bit set before? * ((meta->mode & 0222) & mode) == 0 Will all the write bits be cleared? */ - if ((meta->mode & 0222) && - (((meta->mode & 0222) & mode) == 0)) { + if ((meta->attrs.mode & 0222) && + (((meta->attrs.mode & 0222) & mode) == 0)) { /* We're laminating. 
*/ ret = invoke_client_laminate_rpc(gfid); if (ret) { @@ -2170,8 +2170,8 @@ static int __chmod(int fid, mode_t mode) } /* Clear out our old permission bits, and set the new ones in */ - meta->mode = meta->mode & ~0777; - meta->mode = meta->mode | mode; + meta->attrs.mode = meta->attrs.mode & ~0777; + meta->attrs.mode = meta->attrs.mode | mode; /* update the global meta data to reflect new permissions */ unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CHMOD; diff --git a/client/src/unifyfs-sysio.h b/client/src/unifyfs-sysio.h index a6e93939e..8f7ac4f6e 100644 --- a/client/src/unifyfs-sysio.h +++ b/client/src/unifyfs-sysio.h @@ -56,6 +56,7 @@ UNIFYFS_DECL(access, int, (const char* pathname, int mode)); UNIFYFS_DECL(mkdir, int, (const char* path, mode_t mode)); UNIFYFS_DECL(rmdir, int, (const char* path)); UNIFYFS_DECL(chdir, int, (const char* path)); +UNIFYFS_DECL(chmod, int, (const char* path, mode_t mode)); UNIFYFS_DECL(__getcwd_chk, char*, (char* path, size_t, size_t)); UNIFYFS_DECL(getcwd, char*, (char* path, size_t)); UNIFYFS_DECL(getwd, char*, (char* path)); @@ -94,6 +95,7 @@ UNIFYFS_DECL(posix_fadvise, int, (int fd, off_t offset, off_t len, int advice)); UNIFYFS_DECL(lseek, off_t, (int fd, off_t offset, int whence)); UNIFYFS_DECL(lseek64, off64_t, (int fd, off64_t offset, int whence)); UNIFYFS_DECL(fchdir, int, (int fd)); +UNIFYFS_DECL(fchmod, int, (int fd, mode_t mode)); UNIFYFS_DECL(ftruncate, int, (int fd, off_t length)); UNIFYFS_DECL(fstat, int, (int fd, struct stat* buf)); UNIFYFS_DECL(__fxstat, int, (int vers, int fd, struct stat* buf)); diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 719b12c63..87edfcbd3 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -42,6 +42,7 @@ #include "unifyfs.h" #include "unifyfs-internal.h" +#include "unifyfs-fixed.h" #include "client_read.h" // client-server rpc headers @@ -54,12 +55,12 @@ #endif /* HAVE_SPATH */ /* avoid duplicate mounts (for now) */ -static int unifyfs_mounted = -1; +int unifyfs_mounted = -1; /* whether we can use fgetpos/fsetpos */ static int unifyfs_fpos_enabled = 1; -unifyfs_cfg_t client_cfg; +static unifyfs_cfg_t client_cfg; unifyfs_index_buf_t unifyfs_indices; static size_t unifyfs_index_buf_size; /* size of metadata log */ @@ -68,8 +69,8 @@ unsigned long unifyfs_max_index_entries; /* max metadata log entries */ int global_rank_cnt; /* count of world ranks */ int client_rank; /* client-provided rank (for debugging) */ -int unifyfs_app_id; -int unifyfs_client_id; +int unifyfs_app_id; /* application (aka mountpoint) id */ +int unifyfs_client_id; /* client id within application */ static int unifyfs_use_single_shm = 0; static int unifyfs_page_size = 0; @@ -508,7 +509,7 @@ int unifyfs_fid_is_laminated(int fid) { unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); if ((meta != NULL) && (meta->fid == fid)) { - return meta->is_laminated; + return meta->attrs.is_laminated; } return 0; } @@ -561,7 +562,7 @@ static int fid_store_alloc(int fid) } /* free data management resource for file */ -static int fid_store_free(int fid) +static int fid_storage_free(int fid) { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); @@ -595,12 +596,10 @@ static int fid_store_free(int fid) int unifyfs_fid_is_dir(int fid) { unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((meta != NULL) && (meta->mode & S_IFDIR)) { + if ((meta != NULL) && (meta->attrs.mode & S_IFDIR)) { return 1; - } else { - /* if it doesn't exist, then it's not a directory? 
*/ - return 0; } + return 0; } int unifyfs_gfid_from_fid(const int fid) @@ -613,7 +612,7 @@ int unifyfs_gfid_from_fid(const int fid) /* return global file id, cached in file meta struct */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); if (meta != NULL) { - return meta->gfid; + return meta->attrs.gfid; } else { return -1; } @@ -626,7 +625,7 @@ int unifyfs_fid_from_gfid(int gfid) int i; for (i = 0; i < unifyfs_max_files; i++) { if (unifyfs_filelist[i].in_use && - unifyfs_filemetas[i].gfid == gfid) { + unifyfs_filemetas[i].attrs.gfid == gfid) { /* found a file id that's in use and it matches * the target fid, this is the one */ return i; @@ -683,7 +682,7 @@ off_t unifyfs_fid_global_size(int fid) /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); if (meta != NULL) { - return meta->global_size; + return meta->attrs.size; } return (off_t)-1; } @@ -744,32 +743,22 @@ off_t unifyfs_gfid_filesize(int gfid) return filesize; } -/* Update local metadata for file from global metadata. - * Currently, this updates the is_laminated flag, and if - * the file is laminated, it also updates the global_size value */ +/* Update local metadata for file from global metadata */ int unifyfs_fid_update_file_meta(int fid, unifyfs_file_attr_t* gfattr) { if (NULL == gfattr) { - return UNIFYFS_FAILURE; + return EINVAL; } /* lookup local metadata for file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); if (meta != NULL) { - /* update lamination state */ - meta->is_laminated = gfattr->is_laminated; - if (meta->is_laminated) { - /* file has been laminated, record its size - * for local lookup since it won't change */ - meta->global_size = (off_t)gfattr->size; - LOGDBG("laminated file size is %zu bytes", - (size_t)meta->global_size); - } + meta->attrs = *gfattr; return UNIFYFS_SUCCESS; } /* else, bad fid */ - return UNIFYFS_FAILURE; + return EINVAL; } /* @@ -822,8 +811,8 @@ int unifyfs_get_global_file_meta(int gfid, unifyfs_file_attr_t* gfattr) } /* - * Set the metadata values for a file (after optionally creating it), - * using metadata associated with a given local file id. + * Set the global metadata values for a file using local file + * attributes associated with the given local file id. * * fid: The local file id on which to base global metadata values. 
* @@ -842,43 +831,19 @@ int unifyfs_set_global_file_meta_from_fid(int fid, unifyfs_file_attr_op_e op) unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); assert(meta != NULL); - /* get file name */ - char* filename = (char*) unifyfs_path_from_fid(fid); - /* set global file id */ - fattr.gfid = meta->gfid; + fattr.gfid = meta->attrs.gfid; LOGDBG("setting global file metadata for fid:%d gfid:%d path:%s", - fid, fattr.gfid, filename); + fid, fattr.gfid, meta->attrs.filename); - /* use current time for atime/mtime/ctime */ - struct timespec tp = {0}; - clock_gettime(CLOCK_REALTIME, &tp); - fattr.atime = tp; - fattr.mtime = tp; - fattr.ctime = tp; - - /* copy file mode bits */ - fattr.mode = meta->mode; - - if (op == UNIFYFS_FILE_ATTR_OP_CREATE) { - /* these fields are set by server, except when we're creating a - * new file in which case we should initialize them both to 0 */ - fattr.is_laminated = 0; - fattr.size = 0; - - /* capture current uid and gid */ - fattr.uid = getuid(); - fattr.gid = getgid(); - - fattr.filename = filename; - } + unifyfs_file_attr_update(op, &fattr, &(meta->attrs)); LOGDBG("using following attributes"); debug_print_file_attr(&fattr); /* submit file attributes to global key/value store */ - int ret = unifyfs_set_global_file_meta(meta->gfid, op, &fattr); + int ret = unifyfs_set_global_file_meta(fattr.gfid, op, &fattr); return ret; } @@ -935,17 +900,32 @@ int unifyfs_fid_create_file(const char* path) unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); assert(meta != NULL); - /* initialize meta data */ - meta->global_size = 0; - meta->flock_status = UNLOCKED; - meta->storage = FILE_STORAGE_NULL; + /* initialize file attributes */ + unifyfs_file_attr_set_invalid(&(meta->attrs)); + meta->attrs.gfid = unifyfs_generate_gfid(path); + meta->attrs.size = 0; + meta->attrs.mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; + meta->attrs.is_laminated = 0; + meta->attrs.filename = (char*)&(unifyfs_filelist[fid].filename); + + /* use client user/group */ + meta->attrs.uid = getuid(); + meta->attrs.gid = getgid(); + + /* use current time for atime/mtime/ctime */ + struct timespec tp = {0}; + clock_gettime(CLOCK_REALTIME, &tp); + meta->attrs.atime = tp; + meta->attrs.mtime = tp; + meta->attrs.ctime = tp; + + /* set UnifyFS client metadata */ meta->fid = fid; - meta->gfid = unifyfs_generate_gfid(path); + meta->storage = FILE_STORAGE_NULL; meta->needs_sync = 0; - meta->is_laminated = 0; - meta->mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; /* PTHREAD_PROCESS_SHARED allows Process-Shared Synchronization */ + meta->flock_status = UNLOCKED; pthread_spin_init(&meta->fspinlock, PTHREAD_PROCESS_SHARED); return fid; @@ -967,14 +947,9 @@ int unifyfs_fid_create_directory(const char* path) int found_local = (fid >= 0); /* test whether we have metadata for file in global key/value store */ - int found_global = 0; unifyfs_file_attr_t gfattr = { 0, }; if (unifyfs_get_global_file_meta(gfid, &gfattr) == UNIFYFS_SUCCESS) { - found_global = 1; - } - - /* can't create if it already exists */ - if (found_global) { + /* can't create if it already exists */ return EEXIST; } @@ -998,16 +973,15 @@ int unifyfs_fid_create_directory(const char* path) return EEXIST; } - /* now, we need to create a new directory. */ + /* now, we need to create a new directory. 
we reuse the file creation + * method and then update the mode to indicate it's a directory */ fid = unifyfs_fid_create_file(path); if (fid < 0) { return -fid; } - - /* Set as directory */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); assert(meta != NULL); - meta->mode = (meta->mode & ~S_IFREG) | S_IFDIR; + meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; /* insert global meta data for directory */ unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; @@ -1023,10 +997,10 @@ int unifyfs_fid_create_directory(const char* path) /* delete a file id, free its local storage resources and return * the file id to free stack */ -static int unifyfs_fid_delete(int fid) +int unifyfs_fid_delete(int fid) { /* finalize the storage we're using for this file */ - int rc = fid_store_free(fid); + int rc = fid_storage_free(fid); if (rc != UNIFYFS_SUCCESS) { /* failed to release structures tracking storage, * bail out to keep its file id active */ @@ -1072,6 +1046,11 @@ int unifyfs_fid_write( unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); assert(meta != NULL); + if (meta->attrs.is_laminated) { + /* attempt to write to laminated file, return read-only filesystem */ + return EROFS; + } + /* determine storage type to write file data */ if (meta->storage == FILE_STORAGE_LOGIO) { /* file stored in logged i/o */ @@ -1083,7 +1062,7 @@ int unifyfs_fid_write( /* optionally sync after every write */ if (unifyfs_write_sync) { - int ret = unifyfs_sync(fid); + int ret = unifyfs_sync_extents(fid); if (ret != UNIFYFS_SUCCESS) { LOGERR("client sync after write failed"); rc = ret; @@ -1109,11 +1088,11 @@ int unifyfs_fid_truncate(int fid, off_t length) assert(meta != NULL); /* truncate is not valid for directories */ - if (S_ISDIR(meta->mode)) { + if (S_ISDIR(meta->attrs.mode)) { return EISDIR; } - if (meta->is_laminated) { + if (meta->attrs.is_laminated) { /* Can't truncate a laminated file */ return EINVAL; } @@ -1137,7 +1116,7 @@ int unifyfs_fid_truncate(int fid, off_t length) /* update global size in filemeta to reflect truncated size. * note that log size is not affected */ - meta->global_size = length; + meta->attrs.size = length; /* invoke truncate rpc */ int gfid = unifyfs_gfid_from_fid(fid); @@ -1161,7 +1140,7 @@ int unifyfs_fid_sync(int fid) /* sync data with server */ if (meta->needs_sync) { - ret = unifyfs_sync(fid); + ret = unifyfs_sync_extents(fid); } return ret; @@ -1415,7 +1394,7 @@ int unifyfs_fid_close(int fid) return UNIFYFS_SUCCESS; } -/* delete a file id and return file its resources to free pools */ +/* unlink file and then delete its associated state */ int unifyfs_fid_unlink(int fid) { int rc; @@ -1432,7 +1411,7 @@ int unifyfs_fid_unlink(int fid) /* finalize the storage we're using for this file */ rc = unifyfs_fid_delete(fid); if (rc != UNIFYFS_SUCCESS) { - /* released strorage for file, but failed to release + /* released storage for file, but failed to release * structures tracking storage, again bail out to keep * its file id active */ return rc; @@ -1610,7 +1589,7 @@ static int init_superblock_shm(size_t super_sz) /* Clear any index entries from the cache. We do this to ensure * the newly allocated seg trees are consistent with the extents - * in the index. It would be nice to call unifyfs_sync to flush + * in the index. It would be nice to call unifyfs_sync_extents to flush * any entries to the server, but we can't do that since that will * try to rewrite the index using the trees, which point to invalid * memory at this point. 
*/ @@ -1641,7 +1620,7 @@ static int init_superblock_shm(size_t super_sz) return UNIFYFS_SUCCESS; } -static int unifyfs_init(void) +int unifyfs_init(unifyfs_cfg_t* clnt_cfg) { int rc; int i; @@ -1681,7 +1660,7 @@ static int unifyfs_init(void) unifyfs_min_long = LONG_MIN; /* set our current working directory if user gave us one */ - cfgval = client_cfg.client_cwd; + cfgval = clnt_cfg->client_cwd; if (cfgval != NULL) { unifyfs_cwd = strdup(cfgval); @@ -1721,7 +1700,7 @@ static int unifyfs_init(void) /* determine max number of files to store in file system */ unifyfs_max_files = UNIFYFS_CLIENT_MAX_FILES; - cfgval = client_cfg.client_max_files; + cfgval = clnt_cfg->client_max_files; if (cfgval != NULL) { rc = configurator_int_val(cfgval, &l); if (rc == 0) { @@ -1732,7 +1711,7 @@ static int unifyfs_init(void) /* Determine if we should track all write extents and use them * to service read requests if all data is local */ unifyfs_local_extents = 0; - cfgval = client_cfg.client_local_extents; + cfgval = clnt_cfg->client_local_extents; if (cfgval != NULL) { rc = configurator_bool_val(cfgval, &b); if (rc == 0) { @@ -1744,7 +1723,7 @@ static int unifyfs_init(void) * This slows write performance, but it can serve as a work * around for apps that do not have all necessary syncs. */ unifyfs_write_sync = false; - cfgval = client_cfg.client_write_sync; + cfgval = clnt_cfg->client_write_sync; if (cfgval != NULL) { rc = configurator_bool_val(cfgval, &b); if (rc == 0) { @@ -1766,7 +1745,7 @@ static int unifyfs_init(void) /* define size of buffer used to cache key/value pairs for * data offsets before passing them to the server */ unifyfs_index_buf_size = UNIFYFS_CLIENT_WRITE_INDEX_SIZE; - cfgval = client_cfg.client_write_index_size; + cfgval = clnt_cfg->client_write_index_size; if (cfgval != NULL) { rc = configurator_int_val(cfgval, &l); if (rc == 0) { @@ -1841,7 +1820,7 @@ static int unifyfs_init(void) /* initialize log-based I/O context */ rc = unifyfs_logio_init_client(unifyfs_app_id, unifyfs_client_id, - &client_cfg, &logio_ctx); + clnt_cfg, &logio_ctx); if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to initialize log-based I/O (rc = %s)", unifyfs_rc_enum_str(rc)); @@ -1858,7 +1837,7 @@ static int unifyfs_init(void) /* free resources allocated during unifyfs_init(). 
* generally, we do this in reverse order with respect to * how things were initialized */ -static int unifyfs_finalize(void) +int unifyfs_fini(void) { int rc = UNIFYFS_SUCCESS; @@ -1911,14 +1890,16 @@ static int unifyfs_finalize(void) * --------------- */ /* Fill mount rpc input struct with client-side context info */ -void fill_client_mount_info(unifyfs_mount_in_t* in) +void fill_client_mount_info(unifyfs_cfg_t* clnt_cfg, + unifyfs_mount_in_t* in) { in->dbg_rank = client_rank; - in->mount_prefix = strdup(client_cfg.unifyfs_mountpoint); + in->mount_prefix = strdup(clnt_cfg->unifyfs_mountpoint); } /* Fill attach rpc input struct with client-side context info */ -void fill_client_attach_info(unifyfs_attach_in_t* in) +void fill_client_attach_info(unifyfs_cfg_t* clnt_cfg, + unifyfs_attach_in_t* in) { size_t meta_offset = (char*)unifyfs_indices.ptr_num_entries - (char*)shm_super_ctx->addr; @@ -1939,7 +1920,7 @@ void fill_client_attach_info(unifyfs_attach_in_t* in) in->logio_spill_size = logio_ctx->spill_sz; if (logio_ctx->spill_sz) { - in->logio_spill_dir = strdup(client_cfg.logio_spill_dir); + in->logio_spill_dir = strdup(clnt_cfg->logio_spill_dir); } else { in->logio_spill_dir = NULL; } @@ -1981,7 +1962,7 @@ int unifyfs_mount( unifyfs_log_open(NULL); // initialize configuration - rc = unifyfs_config_init(&client_cfg, 0, NULL); + rc = unifyfs_config_init(&client_cfg, 0, NULL, 0, NULL); if (rc) { LOGERR("failed to initialize configuration."); return UNIFYFS_FAILURE; @@ -2031,7 +2012,7 @@ int unifyfs_mount( /* Call client mount rpc function to get client id */ LOGDBG("calling mount rpc"); - rc = invoke_client_mount_rpc(); + rc = invoke_client_mount_rpc(&client_cfg); if (rc != UNIFYFS_SUCCESS) { /* If we fail to connect to the server, bail with an error */ LOGERR("failed to mount to server"); @@ -2040,7 +2021,7 @@ int unifyfs_mount( /* initialize our library using assigned client id, creates shared memory * regions (e.g., superblock and data recv) and inits log-based I/O */ - rc = unifyfs_init(); + rc = unifyfs_init(&client_cfg); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -2048,11 +2029,11 @@ int unifyfs_mount( /* Call client attach rpc function to register our newly created shared * memory and files with server */ LOGDBG("calling attach rpc"); - rc = invoke_client_attach_rpc(); + rc = invoke_client_attach_rpc(&client_cfg); if (rc != UNIFYFS_SUCCESS) { /* If we fail, bail with an error */ LOGERR("failed to attach to server"); - unifyfs_finalize(); + unifyfs_fini(); return rc; } @@ -2064,7 +2045,7 @@ int unifyfs_mount( /* if there was an error, return it */ LOGERR("failed to create directory entry for mount point: `%s'", prefix); - unifyfs_finalize(); + unifyfs_fini(); return UNIFYFS_FAILURE; } } @@ -2091,7 +2072,7 @@ int unifyfs_unmount(void) /* sync any outstanding writes */ LOGDBG("syncing data"); - int rc = unifyfs_sync(-1); + int rc = unifyfs_sync_extents(-1); if (rc != UNIFYFS_SUCCESS) { LOGERR("client sync failed"); ret = UNIFYFS_FAILURE; @@ -2118,7 +2099,7 @@ int unifyfs_unmount(void) ************************/ /* free resources allocated in unifyfs_init */ - unifyfs_finalize(); + unifyfs_fini(); /* free memory tracking our mount prefix string */ if (unifyfs_mount_prefix != NULL) { @@ -2151,290 +2132,3 @@ int unifyfs_unmount(void) return ret; } - -#define UNIFYFS_TX_BUFSIZE (8*(1<<20)) - -enum { - UNIFYFS_TX_STAGE_OUT = 0, - UNIFYFS_TX_STAGE_IN = 1, - UNIFYFS_TX_SERIAL = 0, - UNIFYFS_TX_PARALLEL = 1, -}; - -static -ssize_t do_transfer_data(int fd_src, int fd_dst, off_t offset, size_t 
count) -{ - ssize_t ret = 0; - off_t pos = 0; - ssize_t n_written = 0; - ssize_t n_left = 0; - ssize_t n_processed = 0; - size_t len = UNIFYFS_TX_BUFSIZE; - char* buf = NULL; - - buf = malloc(UNIFYFS_TX_BUFSIZE); - if (!buf) { - LOGERR("failed to allocate transfer buffer"); - return ENOMEM; - } - - pos = lseek(fd_src, offset, SEEK_SET); - if (pos == (off_t) -1) { - LOGERR("lseek failed (%d: %s)\n", errno, strerror(errno)); - ret = errno; - goto out; - } - - pos = lseek(fd_dst, offset, SEEK_SET); - if (pos == (off_t) -1) { - LOGERR("lseek failed (%d: %s)\n", errno, strerror(errno)); - ret = errno; - goto out; - } - - while (count > n_processed) { - if (len > count) { - len = count; - } - - n_left = read(fd_src, buf, len); - - if (n_left == 0) { /* EOF */ - break; - } else if (n_left < 0) { /* error */ - ret = errno; - goto out; - } - - do { - n_written = write(fd_dst, buf, n_left); - - if (n_written < 0) { - ret = errno; - goto out; - } else if (n_written == 0 && errno && errno != EAGAIN) { - ret = errno; - goto out; - } - - n_left -= n_written; - n_processed += n_written; - } while (n_left); - } - -out: - if (buf) { - free(buf); - buf = NULL; - } - - return ret; -} - -static int do_transfer_file_serial(const char* src, const char* dst, - struct stat* sb_src, int dir) -{ - int ret = 0; - int fd_src = 0; - int fd_dst = 0; - - /* - * for now, we do not use the @dir hint. - */ - - fd_src = open(src, O_RDONLY); - if (fd_src < 0) { - return errno; - } - - fd_dst = open(dst, O_CREAT | O_WRONLY | O_TRUNC, 0644); - if (fd_dst < 0) { - ret = errno; - goto out_close_src; - } - - LOGDBG("serial transfer (%d/%d): offset=0, length=%lu", - client_rank, global_rank_cnt, (unsigned long) sb_src->st_size); - - ret = do_transfer_data(fd_src, fd_dst, 0, sb_src->st_size); - if (ret < 0) { - LOGERR("do_transfer_data failed!"); - } else { - fsync(fd_dst); - } - - close(fd_dst); -out_close_src: - close(fd_src); - - return ret; -} - -static int do_transfer_file_parallel(const char* src, const char* dst, - struct stat* sb_src, int dir) -{ - int ret = 0; - int fd_src = 0; - int fd_dst = 0; - uint64_t total_chunks = 0; - uint64_t chunk_start = 0; - uint64_t remainder = 0; - uint64_t n_chunks = 0; - uint64_t offset = 0; - uint64_t len = 0; - uint64_t size = sb_src->st_size; - - fd_src = open(src, O_RDONLY); - if (fd_src < 0) { - LOGERR("failed to open file %s", src); - return errno; - } - - /* - * if the file is smaller than (rankcount*buffersize), just do with the - * serial mode. - * - * FIXME: is this assumtion fair even for the large rank count? - */ - if ((UNIFYFS_TX_BUFSIZE * global_rank_cnt) > size) { - if (client_rank == 0) { - ret = do_transfer_file_serial(src, dst, sb_src, dir); - if (ret) { - LOGERR("do_transfer_file_parallel failed"); - } - - return ret; - } - } - - total_chunks = size / UNIFYFS_TX_BUFSIZE; - if (size % UNIFYFS_TX_BUFSIZE) { - total_chunks++; - } - - n_chunks = total_chunks / global_rank_cnt; - remainder = total_chunks % global_rank_cnt; - - chunk_start = n_chunks * client_rank; - if (client_rank < remainder) { - chunk_start += client_rank; - n_chunks += 1; - } else { - chunk_start += remainder; - } - - offset = chunk_start * UNIFYFS_TX_BUFSIZE; - - if (client_rank == (global_rank_cnt - 1)) { - len = (n_chunks - 1) * UNIFYFS_TX_BUFSIZE; - remainder = size % UNIFYFS_TX_BUFSIZE; - len += (remainder > 0 ? 
remainder : UNIFYFS_TX_BUFSIZE); - } else { - len = n_chunks * UNIFYFS_TX_BUFSIZE; - } - - if (len > 0) { - LOGDBG("parallel transfer (%d/%d): " - "nchunks=%lu, offset=%lu, length=%lu", - client_rank, global_rank_cnt, - n_chunks, (unsigned long) offset, (unsigned long) len); - - fd_dst = open(dst, O_WRONLY); - if (fd_dst < 0) { - LOGERR("failed to open file %s", dst); - ret = errno; - goto out_close_src; - } - - ret = do_transfer_data(fd_src, fd_dst, offset, len); - if (ret) { - LOGERR("failed to transfer data (ret=%d, %s)", ret, strerror(ret)); - } else { - fsync(fd_dst); - } - - close(fd_dst); - } - -out_close_src: - close(fd_src); - - return ret; -} - -int unifyfs_transfer_file(const char* src, const char* dst, int parallel) -{ - int ret = 0; - int dir = 0; - struct stat sb_src = { 0, }; - mode_t source_file_mode_write_removed; - struct stat sb_dst = { 0, }; - int unify_src = 0; - int unify_dst = 0; - char dst_path[UNIFYFS_MAX_FILENAME] = { 0, }; - char* pos = dst_path; - char* src_path = strdup(src); - - int local_return_val; - - if (!src_path) { - return -ENOMEM; - } - - char src_upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(src, src_upath)) { - dir = UNIFYFS_TX_STAGE_OUT; - unify_src = 1; - } - - ret = UNIFYFS_WRAP(stat)(src, &sb_src); - if (ret < 0) { - return -errno; - } - - pos += sprintf(pos, "%s", dst); - - char dst_upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(dst, dst_upath)) { - dir = UNIFYFS_TX_STAGE_IN; - unify_dst = 1; - } - - ret = UNIFYFS_WRAP(stat)(dst, &sb_dst); - if (ret == 0 && !S_ISREG(sb_dst.st_mode)) { - if (S_ISDIR(sb_dst.st_mode)) { - sprintf(pos, "/%s", basename((char*) src_path)); - } else { - return -EEXIST; - } - } - - if (unify_src + unify_dst != 1) { - // we may fail the operation with EINVAL, but useful for testing - LOGDBG("WARNING: none of pathnames points to unifyfs volume"); - } - - if (parallel) { - local_return_val = - do_transfer_file_parallel(src_path, dst_path, &sb_src, dir); - } else { - local_return_val = - do_transfer_file_serial(src_path, dst_path, &sb_src, dir); - } - - // We know here that one (but not both) of the constituent files - // is in the unify FS. We just have to decide if the *destination* file is. - // If it is, then now that we've transferred it, we'll set it to be readable - // so that it will be laminated and will be readable by other processes. - if (unify_dst) { - // pull the source file's mode bits, remove all the write bits but leave - // the rest intact and store that new mode. Now that the file has been - // copied into the unify file system, chmod the file to the new - // permission. When unify senses all the write bits are removed it will - // laminate the file. - source_file_mode_write_removed = - (sb_src.st_mode) & ~(0222); - chmod(dst_path, source_file_mode_write_removed); - } - return local_return_val; -} diff --git a/client/src/unifyfs_api.c b/client/src/unifyfs_api.c new file mode 100644 index 000000000..f6f61c160 --- /dev/null +++ b/client/src/unifyfs_api.c @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "unifyfs_api_internal.h" + +/* + * Public Methods + */ + +/* Initialize client's use of UnifyFS */ +// TODO: replace unifyfs_mount() +unifyfs_rc unifyfs_initialize(const char* mountpoint, + unifyfs_cfg_option* options, int n_opts, + unifyfs_handle* fshdl) +{ + if ((NULL == mountpoint) || (NULL == fshdl)) { + return EINVAL; + } + *fshdl = UNIFYFS_INVALID_HANDLE; + + // print log messages to stderr + unifyfs_log_open(NULL); + + unifyfs_client* client; + client = (unifyfs_client*) calloc(1, sizeof(unifyfs_client)); + if (NULL == client) { + LOGERR("failed to allocate client handle"); + return ENOMEM; + } + unifyfs_app_id = unifyfs_generate_gfid(mountpoint); + client->app_id = unifyfs_app_id; + + // initialize configuration + unifyfs_cfg_t* client_cfg = &(client->cfg); + int rc = unifyfs_config_init(client_cfg, 0, NULL, n_opts, options); + if (rc) { + LOGERR("failed to initialize client configuration"); + return rc; + } + client_cfg->ptype = UNIFYFS_CLIENT; + client_cfg->unifyfs_mountpoint = strdup(mountpoint); + unifyfs_mount_prefix = client_cfg->unifyfs_mountpoint; + unifyfs_mount_prefixlen = strlen(unifyfs_mount_prefix); + + // set log level from config + char* cfgval = client_cfg->log_verbosity; + if (cfgval != NULL) { + long l; + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + unifyfs_set_log_level((unifyfs_log_level_t)l); + } + } + + // initialize k-v store access + int kv_rank = 0; + int kv_nranks = 1; + rc = unifyfs_keyval_init(client_cfg, &kv_rank, &kv_nranks); + if (rc) { + LOGERR("failed to initialize kvstore"); + return UNIFYFS_FAILURE; + } + + /* open rpc connection to server */ + rc = unifyfs_client_rpc_init(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to initialize client RPC"); + return rc; + } + + /* Call client mount rpc function to get client id */ + LOGDBG("calling mount rpc"); + rc = invoke_client_mount_rpc(client_cfg); + if (rc != UNIFYFS_SUCCESS) { + /* If we fail to connect to the server, bail with an error */ + LOGERR("failed to mount to server"); + return rc; + } + unifyfs_mounted = unifyfs_app_id; + client->is_mounted = true; + client->client_id = unifyfs_client_id; + + /* initialize our library using assigned client id, creates shared memory + * regions (e.g., superblock and data recv) and inits log-based I/O */ + rc = unifyfs_init(client_cfg); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* Call client attach rpc function to register our newly created shared + * memory and files with server */ + LOGDBG("calling attach rpc"); + rc = invoke_client_attach_rpc(client_cfg); + if (rc != UNIFYFS_SUCCESS) { + /* If we fail, bail with an error */ + LOGERR("failed to attach to server"); + unifyfs_fini(); + return rc; + } + + /* add mount point as a new directory in the file list */ + if (unifyfs_get_fid_from_path(mountpoint) < 0) { + /* no entry exists for mount point, so create one */ + int fid = unifyfs_fid_create_directory(mountpoint); + if (fid < 0) { + /* if there was an error, return it */ + LOGERR("failed to create directory entry for mount point: `%s'", + mountpoint); + unifyfs_fini(); + return UNIFYFS_FAILURE; + } + } + + unifyfs_handle client_hdl = (unifyfs_handle) client; + *fshdl = client_hdl; + return UNIFYFS_SUCCESS; +} + +/* Finalize client's use of UnifyFS */ +// TODO: replace unifyfs_unmount() +unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + unifyfs_client* client = fshdl; + + int ret = UNIFYFS_SUCCESS; + + if (client->is_mounted) { + /* sync any 
outstanding writes */ + LOGDBG("syncing data"); + int rc = unifyfs_sync_extents(-1); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("client sync failed"); + ret = rc; + } + + /* invoke unmount rpc to tell server we're disconnecting */ + LOGDBG("calling unmount"); + rc = invoke_client_unmount_rpc(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("client unmount rpc failed"); + ret = rc; + } + + unifyfs_mounted = -1; + } + + /* free resources allocated in client_rpc_init */ + unifyfs_client_rpc_finalize(); + + /************************ + * free our mount point, and detach from structures + * storing data + ************************/ + + /* free resources allocated in unifyfs_init */ + unifyfs_fini(); + + /* free memory tracking our mount prefix string */ + if (unifyfs_mount_prefix != NULL) { + free(unifyfs_mount_prefix); + unifyfs_mount_prefix = NULL; + unifyfs_mount_prefixlen = 0; + client->cfg.unifyfs_mountpoint = NULL; + } + + /************************ + * free configuration values + ************************/ + + /* free global holding current working directory */ + if (unifyfs_cwd != NULL) { + free(unifyfs_cwd); + unifyfs_cwd = NULL; + } + + /* clean up configuration */ + int rc = unifyfs_config_fini(&(client->cfg)); + if (rc != 0) { + LOGERR("unifyfs_config_fini() failed"); + ret = rc; + } + + /* shut down our logging */ + unifyfs_log_close(); + + /* free client structure */ + free(client); + + return ret; +} diff --git a/client/src/unifyfs_api.h b/client/src/unifyfs_api.h new file mode 100644 index 000000000..34d230e9a --- /dev/null +++ b/client/src/unifyfs_api.h @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#ifndef UNIFYFS_API_H +#define UNIFYFS_API_H + +#include +#include +#include +#include + +// libunifyfs_common headers +#include "unifyfs_rc.h" +#include "unifyfs_configurator.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Public Types + */ + +/* UnifyFS file system client (opaque struct) */ +struct unifyfs_client; + +/* UnifyFS file system handle (opaque pointer) */ +typedef struct unifyfs_client* unifyfs_handle; + +/* invalid UnifyFS file system handle */ +#define UNIFYFS_INVALID_HANDLE ((unifyfs_handle)NULL) + + +/* global file id type */ +typedef uint32_t unifyfs_gfid; + +/* a valid gfid generated via MD5 hash will never be zero */ +#define UNIFYFS_INVALID_GFID ((unifyfs_gfid)0) + +/* enumeration of supported I/O request operations */ +typedef enum unifyfs_ioreq_op { + UNIFYFS_IOREQ_NOP = 0, + UNIFYFS_IOREQ_OP_READ, + UNIFYFS_IOREQ_OP_WRITE, + UNIFYFS_IOREQ_OP_SYNC_DATA, + UNIFYFS_IOREQ_OP_SYNC_META, + UNIFYFS_IOREQ_OP_TRUNC, + UNIFYFS_IOREQ_OP_ZERO, +} unifyfs_ioreq_op; + +/* enumeration of I/O request states */ +typedef enum unifyfs_ioreq_state { + UNIFYFS_IOREQ_STATE_INVALID = 0, + UNIFYFS_IOREQ_STATE_IN_PROGRESS, + UNIFYFS_IOREQ_STATE_CANCELED, + UNIFYFS_IOREQ_STATE_COMPLETED +} unifyfs_ioreq_state; + +/* structure to hold I/O request result values */ +typedef struct unifyfs_ioreq_result { + int error; + int rc; + size_t count; +} unifyfs_ioreq_result; + +/* I/O request structure */ +typedef struct unifyfs_io_request { + /* user-specified fields */ + void* user_buf; + size_t nbytes; + off_t offset; + unifyfs_gfid gfid; + unifyfs_ioreq_op op; + + /* async callbacks (not yet supported) + * + * unifyfs_req_notify_fn fn; + * void* notify_user_data; + */ + + /* status/result fields */ + unifyfs_ioreq_state state; + unifyfs_ioreq_result result; + + /* internal fields */ + int _reqid; +} unifyfs_io_request; + +/* enumeration of supported I/O request operations */ +typedef enum unifyfs_transfer_mode { + UNIFYFS_TRANSFER_MODE_INVALID = 0, + UNIFYFS_TRANSFER_MODE_COPY, // simple copy to destination + UNIFYFS_TRANSFER_MODE_MOVE // copy, then remove source +} unifyfs_transfer_mode; + +/* File transfer request structure */ +typedef struct unifyfs_transfer_request { + /* user-specified fields */ + const char* src_path; + const char* dst_path; + unifyfs_transfer_mode mode; + int use_parallel; + + /* async callbacks (not yet supported) + * + * unifyfs_req_notify_fn fn; + * void* notify_user_data; + */ + + /* status/result fields */ + unifyfs_ioreq_state state; + unifyfs_ioreq_result result; + + /* internal fields */ + int _reqid; +} unifyfs_transfer_request; + +/* Global file status struct */ +typedef struct unifyfs_status { + int laminated; + int mode; + off_t local_file_size; + off_t global_file_size; + size_t local_write_nbytes; +} unifyfs_status; + + +/* + * Public Methods + */ + +/* + * Initialize client's use of UnifyFS with given mountpoint + * and configuration. Sets file system handle on success. + * + * @param[in] mountpoint Requested mount prefix + * @param[in] options Array of configuration options + * @param[in] n_opts Size of options array + * @param[out] fshdl Client file system handle + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_initialize(const char* mountpoint, + unifyfs_cfg_option* options, int n_opts, + unifyfs_handle* fshdl); + +/* + * Finalize client's use of UnifyFS. Invalidates given file + * system handle. 
+ * + * @param[in] fshdl Client file system handle + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl); + +/* + * Create and open a new file in UnifyFS. + * + * @param[in] fshdl Client file system handle + * @param[in] flags File creation flags + * @param[in] filepath Path of file to create + * @param[out] gfid Global file id of created file + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_create(unifyfs_handle fshdl, + const int flags, + const char* filepath, + unifyfs_gfid* gfid); + +/* + * Open an existing file in UnifyFS. + * + * @param[in] fshdl Client file system handle + * @param[in] filepath Path of file to open + * @param[out] gfid Global file id of opened file + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_open(unifyfs_handle fshdl, + const char* filepath, + unifyfs_gfid* gfid); + +/* + * Get global file status. + * + * @param[in] fshdl Client file system handle + * @param[in] gfid Global file id of target file + * @param[out] st File status structure + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, + const unifyfs_gfid gfid, + unifyfs_status* st); + +/* + * Synchronize client writes with global metadata. After successful + * completion, writes will be visible to other clients. + * + * @param[in] fshdl Client file system handle + * @param[in] gfid Global file id of target file + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_sync(unifyfs_handle fshdl, + const unifyfs_gfid gfid); + +/* + * Laminate the given file. After successful completion, writes and other + * file state modifying operations will not be permitted by any client. + * + * @param[in] fshdl Client file system handle + * @param[in] filepath Path of file to laminate + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_laminate(unifyfs_handle fshdl, + const char* filepath); + +/* + * Remove an existing file from UnifyFS. + * + * @param[in] fshdl Client file system handle + * @param[in] filepath Path of file to remove + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_remove(unifyfs_handle fshdl, + const char* filepath); + +/* + * Dispatch a set of I/O requests to UnifyFS. + * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of I/O requests array + * @param[in] reqs Array of I/O requests + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs); + +/* + * Cancel a set of outstanding I/O requests. Only requests that + * are still in-progress will be canceled. + * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of I/O requests array + * @param[in] reqs Array of I/O requests + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_cancel_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs); + +/* + * Wait for a set of I/O requests to be completed or canceled. When + * a non-zero value is passed for 'waitall', the function will return + * only after all I/O requests in the array have completed. When zero + * is passed for 'waitall', the function will return after as soon as + * any individual request has completed. 
+ * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of I/O requests array + * @param[in] reqs Array of I/O requests + * @param[in] waitall Wait-all behavior flag + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_wait_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs, + const int waitall); + +/* + * Dispatch a set of transfer requests to UnifyFS. + * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of transfer requests array + * @param[in] reqs Array of transfer requests + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_dispatch_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs); + +/* + * Cancel a set of outstanding transfer requests. Only transfers that + * are still in-progress will be canceled. + * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of transfer requests array + * @param[in] reqs Array of transfer requests + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_cancel_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs); + +/* + * Wait for a set of transfer requests to be completed or canceled. When + * a non-zero value is passed for waitall, the function will return + * only after all transfer requests in the array have completed. When zero + * is passed for waitall, the function will return after as soon as + * any individual request has completed. + * + * @param[in] fshdl Client file system handle + * @param[in] nreqs Size of transfer requests array + * @param[in] reqs Array of transfer requests + * @param[in] waitall Wait-all behavior flag + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs, + const int waitall); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // UNIFYFS_API_H diff --git a/client/src/unifyfs_api_file.c b/client/src/unifyfs_api_file.c new file mode 100644 index 000000000..cc61b766d --- /dev/null +++ b/client/src/unifyfs_api_file.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs_api_internal.h" + + +/* + * Public Methods + */ + +/* Create and open a new file in UnifyFS */ +unifyfs_rc unifyfs_create(unifyfs_handle fshdl, + const int flags, + const char* filepath, + unifyfs_gfid* gfid) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (NULL == filepath) + || (NULL == gfid)) { + return (unifyfs_rc)EINVAL; + } + *gfid = UNIFYFS_INVALID_GFID; + + /* NOTE: the 'flags' parameter is not currently used. 
it is reserved + * for future indication of file-specific behavior */ + + /* the output parameters of unifyfs_fid_open() are not used here, but + * must be provided */ + int fid = -1; + off_t filepos = -1; + + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + int create_flags = (O_CREAT | O_EXCL); + int rc = unifyfs_fid_open(filepath, create_flags, mode, &fid, &filepos); + if (UNIFYFS_SUCCESS == rc) { + *gfid = unifyfs_generate_gfid(filepath); + } + return (unifyfs_rc)rc; +} + +/* Open an existing file in UnifyFS */ +unifyfs_rc unifyfs_open(unifyfs_handle fshdl, + const char* filepath, + unifyfs_gfid* gfid) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (NULL == filepath) + || (NULL == gfid)) { + return (unifyfs_rc)EINVAL; + } + *gfid = UNIFYFS_INVALID_GFID; + + /* the output parameters of unifyfs_fid_open() are not used here, but + * must be provided */ + int fid = -1; + off_t filepos = -1; + + mode_t mode = 0; + int flags = O_RDWR; + int rc = unifyfs_fid_open(filepath, flags, mode, &fid, &filepos); + if (UNIFYFS_SUCCESS == rc) { + *gfid = unifyfs_generate_gfid(filepath); + } + return (unifyfs_rc)rc; +} + +/* Synchronize client writes with server */ +unifyfs_rc unifyfs_sync(unifyfs_handle fshdl, + const unifyfs_gfid gfid) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (UNIFYFS_INVALID_GFID == gfid)) { + return (unifyfs_rc)EINVAL; + } + + int fid = unifyfs_fid_from_gfid((int)gfid); + if (-1 == fid) { + return (unifyfs_rc)EINVAL; + } + + int rc = unifyfs_fid_sync(fid); + return (unifyfs_rc)rc; +} + +/* Get global file status */ +unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, + const unifyfs_gfid gfid, + unifyfs_status* st) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (UNIFYFS_INVALID_GFID == gfid) + || (NULL == st)) { + return (unifyfs_rc)EINVAL; + } + + int fid = unifyfs_fid_from_gfid((int)gfid); + if (-1 == fid) { + return (unifyfs_rc)EINVAL; + } + + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + if (meta == NULL) { + LOGERR("missing local file metadata for gfid=%d", (int)gfid); + return UNIFYFS_FAILURE; + } + + /* get global metadata to pick up current file size */ + unifyfs_file_attr_t attr = {0}; + int rc = unifyfs_get_global_file_meta((int)gfid, &attr); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("missing global file metadata for gfid=%d", (int)gfid); + } else { + /* update local file metadata from global metadata */ + unifyfs_fid_update_file_meta(fid, &attr); + } + + st->global_file_size = meta->attrs.size; + st->laminated = meta->attrs.is_laminated; + st->mode = (int) meta->attrs.mode; + + /* TODO - need new metadata fields to track these */ + st->local_file_size = meta->attrs.size; + st->local_write_nbytes = 0; + return UNIFYFS_SUCCESS; +} + +/* Global lamination - no further writes to file are permitted */ +unifyfs_rc unifyfs_laminate(unifyfs_handle fshdl, + const char* filepath) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (NULL == filepath)) { + return (unifyfs_rc)EINVAL; + } + + int gfid = unifyfs_generate_gfid(filepath); + int rc = invoke_client_laminate_rpc(gfid); + if (UNIFYFS_SUCCESS == rc) { + /* update the local state for this file (if any) */ + int fid = unifyfs_fid_from_gfid((int)gfid); + if (-1 != fid) { + /* get global metadata to pick up file size and laminated flag */ + unifyfs_file_attr_t attr = {0}; + rc = unifyfs_get_global_file_meta(gfid, &attr); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("missing global metadata for %s (gfid:%d)", + filepath, gfid); + } else { + /* update local file metadata from global metadata */ + 
unifyfs_fid_update_file_meta(fid, &attr); + } + } + } + + return (unifyfs_rc)rc; +} + +/* Remove an existing file from UnifyFS */ +unifyfs_rc unifyfs_remove(unifyfs_handle fshdl, + const char* filepath) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) + || (NULL == filepath)) { + return (unifyfs_rc)EINVAL; + } + + unifyfs_rc ret = UNIFYFS_SUCCESS; + + /* invoke unlink rpc */ + int gfid = unifyfs_generate_gfid(filepath); + int rc = invoke_client_unlink_rpc(gfid); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + + /* clean up the local state for this file (if any) */ + int fid = unifyfs_fid_from_gfid(gfid); + if (-1 != fid) { + rc = unifyfs_fid_delete(fid); + if (rc != UNIFYFS_SUCCESS) { + /* released storage for file, but failed to release + * structures tracking storage, again bail out to keep + * its file id active */ + ret = rc; + } + } + + return ret; +} diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h new file mode 100644 index 000000000..b2040591e --- /dev/null +++ b/client/src/unifyfs_api_internal.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_API_INTERNAL_H +#define UNIFYFS_API_INTERNAL_H + +#include "unifyfs_api.h" +#include "unifyfs-internal.h" +#include "unifyfs-fixed.h" + +// client-server rpc headers +#include "unifyfs_client_rpcs.h" +#include "unifyfs_rpc_util.h" +#include "margo_client.h" + +/* UnifyFS file system client structure */ +typedef struct unifyfs_client { + int app_id; /* application id (gfid for mountpoint) */ + int client_id; /* client id within application */ + + bool is_mounted; /* has client mounted? */ + + unifyfs_cfg_t cfg; /* client configuration */ +} unifyfs_client; + +#endif // UNIFYFS_API_INTERNAL_H diff --git a/client/src/unifyfs_api_io.c b/client/src/unifyfs_api_io.c new file mode 100644 index 000000000..e0df90d00 --- /dev/null +++ b/client/src/unifyfs_api_io.c @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include // usleep + +#include "unifyfs_api_internal.h" +#include "client_read.h" + + +/* + * Private Methods + */ + +static int process_gfid_writes(unifyfs_io_request* wr_reqs, + size_t n_reqs) +{ + int ret = UNIFYFS_SUCCESS; + + size_t i; + for (i = 0; i < n_reqs; i++) { + unifyfs_io_request* req = wr_reqs + i; + + int fid = unifyfs_fid_from_gfid(req->gfid); + if (-1 == fid) { + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->result.error = EINVAL; + continue; + } + + if (req->op == UNIFYFS_IOREQ_OP_ZERO) { + /* TODO: support OP_ZERO in a more efficient manner */ + req->user_buf = calloc(1, req->nbytes); + if (NULL == req->user_buf) { + ret = ENOMEM; + continue; + } + } + + /* write user buffer to file */ + int rc = unifyfs_fid_write(fid, req->offset, req->user_buf, + req->nbytes, &(req->result.count)); + if (rc != UNIFYFS_SUCCESS) { + req->result.error = rc; + } + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + + if (req->op == UNIFYFS_IOREQ_OP_ZERO) { + /* cleanup allocated OP_ZERO buffer */ + free(req->user_buf); + req->user_buf = NULL; + } + } + + return ret; +} + +static int process_gfid_truncates(unifyfs_io_request* tr_reqs, + size_t n_reqs) +{ + int ret = UNIFYFS_SUCCESS; + + size_t i; + for (i = 0; i < n_reqs; i++) { + unifyfs_io_request* req = tr_reqs + i; + + int fid = unifyfs_fid_from_gfid(req->gfid); + if (-1 == fid) { + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->result.error = EINVAL; + } + + int rc = unifyfs_fid_truncate(fid, req->offset); + if (rc != UNIFYFS_SUCCESS) { + req->result.error = rc; + } + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + } + + return ret; +} + +static int process_gfid_syncs(unifyfs_io_request* s_reqs, + size_t n_reqs) +{ + int ret = UNIFYFS_SUCCESS; + int rc; + int data_sync_completed = 0; + size_t i; + for (i = 0; i < n_reqs; i++) { + unifyfs_io_request* req = s_reqs + i; + + if (req->op == UNIFYFS_IOREQ_OP_SYNC_META) { + int fid = unifyfs_fid_from_gfid(req->gfid); + if (-1 == fid) { + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->result.error = EINVAL; + } + + rc = unifyfs_fid_sync(fid); + if (rc != UNIFYFS_SUCCESS) { + req->result.error = rc; + } + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + } else if (req->op == UNIFYFS_IOREQ_OP_SYNC_DATA) { + /* logio_sync covers all files' data - only do it once */ + if (!data_sync_completed) { + rc = unifyfs_logio_sync(logio_ctx); + if (UNIFYFS_SUCCESS != rc) { + req->result.error = rc; + } else { + data_sync_completed = 1; + } + } + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + } + } + + return ret; +} + + +/* + * Public Methods + */ + +/* Dispatch an array of I/O requests */ +unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (0 == nreqs) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + unifyfs_io_request* req; + + /* determine counts of various operations */ + size_t n_read = 0; + size_t n_write = 0; + size_t n_trunc = 0; + size_t n_sync = 0; + for (size_t i = 0; i < nreqs; i++) { + req = reqs + i; + switch (req->op) { + case UNIFYFS_IOREQ_NOP: + break; + case UNIFYFS_IOREQ_OP_READ: + n_read++; + break; + case UNIFYFS_IOREQ_OP_WRITE: + case UNIFYFS_IOREQ_OP_ZERO: + n_write++; + break; + case UNIFYFS_IOREQ_OP_SYNC_DATA: + case UNIFYFS_IOREQ_OP_SYNC_META: + n_sync++; + break; + case UNIFYFS_IOREQ_OP_TRUNC: + n_trunc++; + break; + default: + LOGERR("invalid ioreq operation"); + return EINVAL; + } + } + + /* construct per-op 
requests arrays */ + read_req_t* rd_reqs = NULL; + if (n_read) { + rd_reqs = (read_req_t*) calloc(n_read, sizeof(read_req_t)); + if (NULL == rd_reqs) { + return ENOMEM; + } + } + unifyfs_io_request* wr_reqs = NULL; + if (n_write) { + wr_reqs = (unifyfs_io_request*) + calloc(n_write, sizeof(unifyfs_io_request)); + if (NULL == wr_reqs) { + return ENOMEM; + } + } + unifyfs_io_request* tr_reqs = NULL; + if (n_trunc) { + tr_reqs = (unifyfs_io_request*) + calloc(n_trunc, sizeof(unifyfs_io_request)); + if (NULL == tr_reqs) { + return ENOMEM; + } + } + unifyfs_io_request* s_reqs = NULL; + if (n_sync) { + s_reqs = (unifyfs_io_request*) + calloc(n_sync, sizeof(unifyfs_io_request)); + if (NULL == s_reqs) { + return ENOMEM; + } + } + + size_t i; + size_t rd_ndx = 0; + size_t wr_ndx = 0; + size_t tr_ndx = 0; + size_t s_ndx = 0; + for (i = 0; i < nreqs; i++) { + req = reqs + i; + req->state = UNIFYFS_IOREQ_STATE_IN_PROGRESS; + switch (req->op) { + case UNIFYFS_IOREQ_NOP: + break; + case UNIFYFS_IOREQ_OP_READ: { + read_req_t* rd_req = rd_reqs + rd_ndx++; + rd_req->gfid = req->gfid; + rd_req->offset = req->offset; + rd_req->length = req->nbytes; + rd_req->nread = 0; + rd_req->errcode = 0; + rd_req->buf = req->user_buf; + rd_req->cover_begin_offset = (size_t)-1; + rd_req->cover_end_offset = (size_t)-1; + break; + } + case UNIFYFS_IOREQ_OP_WRITE: + case UNIFYFS_IOREQ_OP_ZERO: { + unifyfs_io_request* wr_req = wr_reqs + wr_ndx++; + *wr_req = *req; + break; + } + case UNIFYFS_IOREQ_OP_SYNC_DATA: + case UNIFYFS_IOREQ_OP_SYNC_META: { + unifyfs_io_request* s_req = s_reqs + s_ndx++; + *s_req = *req; + break; + } + case UNIFYFS_IOREQ_OP_TRUNC: { + unifyfs_io_request* tr_req = tr_reqs + tr_ndx++; + *tr_req = *req; + break; + } + default: + break; + } + } + + /* process reads */ + int rc = process_gfid_reads(rd_reqs, (int)n_read); + if (rc != UNIFYFS_SUCCESS) { + /* error encountered while issuing reads */ + for (i = 0; i < n_read; i++) { + read_req_t* rd_req = rd_reqs + i; + rd_req->errcode = rc; + } + } + + /* process writes */ + rc = process_gfid_writes(wr_reqs, n_write); + if (rc != UNIFYFS_SUCCESS) { + /* error encountered while issuing writes */ + for (i = 0; i < n_write; i++) { + unifyfs_io_request* wr_req = wr_reqs + i; + wr_req->result.error = rc; + } + } + + /* process truncates */ + rc = process_gfid_truncates(tr_reqs, n_trunc); + if (rc != UNIFYFS_SUCCESS) { + /* error encountered while issuing writes */ + for (i = 0; i < n_trunc; i++) { + unifyfs_io_request* tr_req = tr_reqs + i; + tr_req->result.error = rc; + } + } + + /* process syncs */ + rc = process_gfid_syncs(s_reqs, n_sync); + if (rc != UNIFYFS_SUCCESS) { + /* error encountered while issuing writes */ + for (i = 0; i < n_sync; i++) { + unifyfs_io_request* s_req = s_reqs + i; + s_req->result.error = rc; + } + } + + /* update ioreq state */ + rd_ndx = 0; + wr_ndx = 0; + tr_ndx = 0; + s_ndx = 0; + for (i = 0; i < nreqs; i++) { + req = reqs + i; + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + switch (req->op) { + case UNIFYFS_IOREQ_NOP: + break; + case UNIFYFS_IOREQ_OP_READ: { + read_req_t* rd_req = rd_reqs + rd_ndx++; + req->result.count = rd_req->nread; + req->result.error = rd_req->errcode; + break; + } + case UNIFYFS_IOREQ_OP_WRITE: + case UNIFYFS_IOREQ_OP_ZERO: { + unifyfs_io_request* wr_req = wr_reqs + wr_ndx++; + *req = *wr_req; + break; + } + case UNIFYFS_IOREQ_OP_SYNC_DATA: + case UNIFYFS_IOREQ_OP_SYNC_META: { + unifyfs_io_request* s_req = s_reqs + s_ndx++; + *req = *s_req; + break; + } + case UNIFYFS_IOREQ_OP_TRUNC: { + unifyfs_io_request* 
tr_req = tr_reqs + tr_ndx++; + *req = *tr_req; + break; + } + default: + break; + } + } + + return UNIFYFS_SUCCESS; +} + +/* Cancel an array of I/O requests */ +unifyfs_rc unifyfs_cancel_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (0 == nreqs) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + return UNIFYFS_ERROR_NYI; +} + +/* Wait for an array of I/O requests to be completed/canceled */ +unifyfs_rc unifyfs_wait_io(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_io_request* reqs, + const int waitall) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (0 == nreqs) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + size_t i, n_done; + while (1) { + n_done = 0; + for (i = 0; i < nreqs; i++) { + unifyfs_io_request* req = reqs + i; + if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || + (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + n_done++; + } + } + if (waitall) { + /* for waitall, all reqs must be done to finish */ + if (n_done == nreqs) { + break; + } + } else if (n_done) { + /* at least one req is done */ + break; + } + usleep(1000); /* sleep 1 ms */ + } + + return UNIFYFS_SUCCESS; +} + + diff --git a/client/src/unifyfs_api_transfer.c b/client/src/unifyfs_api_transfer.c new file mode 100644 index 000000000..558c3de33 --- /dev/null +++ b/client/src/unifyfs_api_transfer.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2020, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + + +#include "unifyfs_api_internal.h" +#include "client_transfer.h" + +/* this avoids a #include of */ +extern int unifyfs_transfer_file(const char* src, + const char* dst, + int parallel); + +/* + * Public Methods + */ + +/* Dispatch an array of transfer requests */ +unifyfs_rc unifyfs_dispatch_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (nreqs == 0) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + unifyfs_transfer_request* req; + for (size_t i = 0; i < nreqs; i++) { + req = reqs + i; + req->state = UNIFYFS_IOREQ_STATE_IN_PROGRESS; + + /* check for a valid transfer mode */ + switch (req->mode) { + case UNIFYFS_TRANSFER_MODE_COPY: + case UNIFYFS_TRANSFER_MODE_MOVE: + break; + default: + req->result.error = EINVAL; + req->result.rc = UNIFYFS_FAILURE; + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + continue; + } + + int rc = unifyfs_transfer_file(req->src_path, req->dst_path, + req->use_parallel); + if (rc) { + /* unifyfs_transfer_file() returns a negative error code */ + req->result.error = -rc; + req->result.rc = UNIFYFS_FAILURE; + } else { + req->result.error = 0; + req->result.rc = UNIFYFS_SUCCESS; + + if (req->mode == UNIFYFS_TRANSFER_MODE_MOVE) { + /* successful copy, now remove source */ + errno = 0; + rc = unlink(req->src_path); + if (rc) { + req->result.error = errno; + req->result.rc = UNIFYFS_FAILURE; + } + } + } + + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + } + + return UNIFYFS_SUCCESS; +} + +/* Cancel an array of transfer requests */ +unifyfs_rc unifyfs_cancel_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (0 == nreqs) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + return UNIFYFS_ERROR_NYI; +} + +/* Wait for an array of transfer requests to be completed/canceled */ +unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, + const size_t nreqs, + unifyfs_transfer_request* reqs, + const int waitall) +{ + if (UNIFYFS_INVALID_HANDLE == fshdl) { + return EINVAL; + } + + if (0 == nreqs) { + return UNIFYFS_SUCCESS; + } else if (NULL == reqs) { + return EINVAL; + } + + size_t i, n_done; + while (1) { + n_done = 0; + for (i = 0; i < nreqs; i++) { + unifyfs_transfer_request* req = reqs + i; + if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || + (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + n_done++; + } + } + if (waitall) { + /* for waitall, all reqs must be done to finish */ + if (n_done == nreqs) { + break; + } + } else if (n_done) { + /* at least one req is done */ + break; + } + usleep(1000); /* sleep 1 ms */ + } + + return UNIFYFS_SUCCESS; +} diff --git a/common/src/unifyfs_configurator.c b/common/src/unifyfs_configurator.c index ac5daaed1..1aa6cd6a3 100644 --- a/common/src/unifyfs_configurator.c +++ b/common/src/unifyfs_configurator.c @@ -50,9 +50,9 @@ // initialize configuration using all available methods -int unifyfs_config_init(unifyfs_cfg_t *cfg, - int argc, - char **argv) +int unifyfs_config_init(unifyfs_cfg_t* cfg, + int argc, char** argv, + int nopt, unifyfs_cfg_option* options) { int rc; char *syscfg = NULL; @@ -60,7 +60,7 @@ int unifyfs_config_init(unifyfs_cfg_t *cfg, if (cfg == NULL) return EINVAL; - memset((void *)cfg, 0, sizeof(unifyfs_cfg_t)); + memset((void*)cfg, 0, sizeof(unifyfs_cfg_t)); // set default configuration rc = unifyfs_config_set_defaults(cfg); @@ 
-81,34 +81,45 @@ int unifyfs_config_init(unifyfs_cfg_t *cfg, // process environment (overrides defaults and system config) rc = unifyfs_config_process_environ(cfg); - if (rc) + if (rc) { + return rc; + } + + // process options array (overrides all previous) + rc = unifyfs_config_process_options(cfg, nopt, options); + if (rc) { return rc; + } // process command-line args (overrides all previous) rc = unifyfs_config_process_cli_args(cfg, argc, argv); - if (rc) + if (rc) { return rc; + } // read config file passed on command-line (does not override cli args) if (cfg->unifyfs_configfile != NULL) { rc = unifyfs_config_process_ini_file(cfg, cfg->unifyfs_configfile); - if (rc) + if (rc) { return rc; + } } // validate settings rc = unifyfs_config_validate(cfg); - if (rc) + if (rc) { return rc; + } return (int)UNIFYFS_SUCCESS; } // cleanup allocated state -int unifyfs_config_fini(unifyfs_cfg_t *cfg) +int unifyfs_config_fini(unifyfs_cfg_t* cfg) { - if (cfg == NULL) + if (cfg == NULL) { return EINVAL; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ if (cfg->sec##_##key != NULL) { \ @@ -150,13 +161,14 @@ int unifyfs_config_fini(unifyfs_cfg_t *cfg) } // print configuration to specified file (or stderr) -void unifyfs_config_print(unifyfs_cfg_t *cfg, - FILE *fp) +void unifyfs_config_print(unifyfs_cfg_t* cfg, + FILE* fp) { char msg[UNIFYFS_CFG_MAX_MSG]; - if (fp == NULL) + if (fp == NULL) { fp = stderr; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ if (cfg->sec##_##key != NULL) { \ @@ -200,11 +212,11 @@ void unifyfs_config_print(unifyfs_cfg_t *cfg, } // print configuration in .ini format to specified file (or stderr) -void unifyfs_config_print_ini(unifyfs_cfg_t *cfg, - FILE *inifp) +void unifyfs_config_print_ini(unifyfs_cfg_t* cfg, + FILE* inifp) { - const char *curr_sec = NULL; - const char *last_sec = NULL; + const char* curr_sec = NULL; + const char* last_sec = NULL; if (inifp == NULL) inifp = stderr; @@ -261,12 +273,13 @@ void unifyfs_config_print_ini(unifyfs_cfg_t *cfg, } // set default values given in UNIFYFS_CONFIGS -int unifyfs_config_set_defaults(unifyfs_cfg_t *cfg) +int unifyfs_config_set_defaults(unifyfs_cfg_t* cfg) { - char *val; + char* val; - if (cfg == NULL) + if (cfg == NULL) { return EINVAL; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ val = stringify(dv); \ @@ -297,7 +310,7 @@ int unifyfs_config_set_defaults(unifyfs_cfg_t *cfg) // utility routine to print CLI usage (and optional usage error message) -void unifyfs_config_cli_usage(char *arg0) +void unifyfs_config_cli_usage(char* arg0) { fprintf(stderr, "USAGE: %s [options]\n", arg0); @@ -323,11 +336,12 @@ void unifyfs_config_cli_usage(char *arg0) } // print usage error message -void unifyfs_config_cli_usage_error(char *arg0, - char *err_msg) +void unifyfs_config_cli_usage_error(char* arg0, + char* err_msg) { - if (err_msg != NULL) + if (err_msg != NULL) { fprintf(stderr, "USAGE ERROR: %s : %s\n\n", arg0, err_msg); + } unifyfs_config_cli_usage(arg0); } @@ -349,9 +363,9 @@ static struct option cli_options[] = { }; // update config struct based on command line args -int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, +int unifyfs_config_process_cli_args(unifyfs_cfg_t* cfg, int argc, - char **argv) + char** argv) { int rc, c; int usage_err = 0; @@ -359,14 +373,15 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, int sndx = 0; char errmsg[UNIFYFS_CFG_MAX_MSG]; char short_opts[256]; - extern char *optarg; + extern char* optarg; extern int optind, optopt; - if (cfg == NULL) + if (cfg == NULL) { return 
EINVAL; + } // setup short_opts and cli_options - memset((void *)short_opts, 0, sizeof(short_opts)); + memset((void*)short_opts, 0, sizeof(short_opts)); short_opts[sndx++] = ':'; // report missing args #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) @@ -471,8 +486,8 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, } // helper to check environment variable -char *getenv_helper(const char *section, - const char *key, +char* getenv_helper(const char* section, + const char* key, unsigned mentry) { static char envname[256]; @@ -480,7 +495,7 @@ char *getenv_helper(const char *section, size_t len; size_t ndx = 0; - memset((void *)envname, 0, sizeof(envname)); + memset((void*)envname, 0, sizeof(envname)); ndx += sprintf(envname, "UNIFYFS_"); @@ -507,12 +522,13 @@ char *getenv_helper(const char *section, // update config struct based on environment variables -int unifyfs_config_process_environ(unifyfs_cfg_t *cfg) +int unifyfs_config_process_environ(unifyfs_cfg_t* cfg) { - char *envval; + char* envval; - if (cfg == NULL) + if (cfg == NULL) { return EINVAL; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ @@ -563,19 +579,20 @@ int unifyfs_config_process_environ(unifyfs_cfg_t *cfg) } // inih callback handler -int inih_config_handler(void *user, - const char *section, - const char *kee, - const char *val) +int inih_config_handler(void* user, + const char* section, + const char* kee, + const char* val) { - char *curval; - char *defval; - unifyfs_cfg_t *cfg = (unifyfs_cfg_t *) user; + char* curval; + char* defval; + unifyfs_cfg_t* cfg = (unifyfs_cfg_t*) user; assert(cfg != NULL); // if not already set by CLI args, set cfg cfgs - if (0) + if (0) { ; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ @@ -621,17 +638,19 @@ UNIFYFS_CONFIGS } // update config struct based on config file, using inih -int unifyfs_config_process_ini_file(unifyfs_cfg_t *cfg, - const char *file) +int unifyfs_config_process_ini_file(unifyfs_cfg_t* cfg, + const char* file) { int rc, inih_rc; char errmsg[UNIFYFS_CFG_MAX_MSG]; - if (cfg == NULL) + if (cfg == NULL) { return EINVAL; + } - if (file == NULL) + if (file == NULL) { return EINVAL; + } inih_rc = ini_parse(file, inih_config_handler, cfg); switch (inih_rc) { @@ -670,39 +689,144 @@ int unifyfs_config_process_ini_file(unifyfs_cfg_t *cfg, return rc; } +// update config struct based on option key-value pair +int unifyfs_config_process_option(unifyfs_cfg_t* cfg, + const char* opt_name, + const char* opt_val) +{ + if ((NULL == cfg) || (NULL == opt_name) || (NULL == opt_val)) { + return EINVAL; + } + + int rc = UNIFYFS_SUCCESS; + char errmsg[UNIFYFS_CFG_MAX_MSG]; + char* curval; + + // split option name into section and key + char* section = NULL; + char* kee = NULL; + char* name_copy = strdup(opt_name); + char* period = strchr(name_copy, '.'); + if (NULL == period) { + rc = EINVAL; + } else { + *period = '\0'; + section = name_copy; + kee = period + 1; + if ((0 == strlen(section)) || + (0 == strlen(kee))) { + rc = EINVAL; + } + } + if (rc != UNIFYFS_SUCCESS) { + snprintf(errmsg, sizeof(errmsg), + "option %s has invalid format - expected '
.'", + opt_name); + fprintf(stderr, "UNIFYFS CONFIG ERROR: %s\n", errmsg); + } else { + // set config for given option (overwrites existing values) + if (0) { + ; + } + +#define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ + else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ + curval = cfg->sec##_##key; \ + if (curval == NULL) \ + cfg->sec##_##key = strdup(opt_val); \ + else { \ + free(cfg->sec##_##key); \ + cfg->sec##_##key = strdup(opt_val); \ + } \ + } + +#define UNIFYFS_CFG_CLI(sec, key, typ, dv, desc, vfn, opt, use) \ + else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ + curval = cfg->sec##_##key; \ + if (curval == NULL) \ + cfg->sec##_##key = strdup(opt_val); \ + else { \ + free(cfg->sec##_##key); \ + cfg->sec##_##key = strdup(opt_val); \ + } \ + } + +#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ + else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ + cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(opt_val); \ + } + +#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ + else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ + cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(opt_val); \ + } + +UNIFYFS_CONFIGS +#undef UNIFYFS_CFG +#undef UNIFYFS_CFG_CLI +#undef UNIFYFS_CFG_MULTI +#undef UNIFYFS_CFG_MULTI_CLI + + } + + free(name_copy); + return rc; +} + +int unifyfs_config_process_options(unifyfs_cfg_t* cfg, + int nopt, + unifyfs_cfg_option* options) +{ + if (nopt > 0) { + if (NULL == options) { + return EINVAL; + } + for (int i = 0; i < nopt; i++) { + unifyfs_cfg_option* opt = options + i; + int rc = unifyfs_config_process_option(cfg, + opt->opt_name, + opt->opt_value); + if (rc) { + return rc; + } + } + } + return UNIFYFS_SUCCESS; +} /* predefined validation functions */ // utility routine to validate a single value given function -int validate_value(const char *section, - const char *key, - const char *val, - const char *typ, +int validate_value(const char* section, + const char* key, + const char* val, + const char* typ, configurator_validate_fn vfn, - char **new_val) + char** new_val) { - if (vfn != NULL) + if (vfn != NULL) { return vfn(section, key, val, new_val); - else if (strcmp(typ, "BOOL") == 0) + } else if (strcmp(typ, "BOOL") == 0) { return configurator_bool_check(section, key, val, NULL); - else if (strcmp(typ, "INT") == 0) + } else if (strcmp(typ, "INT") == 0) { return configurator_int_check(section, key, val, new_val); - else if (strcmp(typ, "FLOAT") == 0) + } else if (strcmp(typ, "FLOAT") == 0) { return configurator_float_check(section, key, val, new_val); - + } return 0; } // validate configuration -int unifyfs_config_validate(unifyfs_cfg_t *cfg) +int unifyfs_config_validate(unifyfs_cfg_t* cfg) { int rc = (int)UNIFYFS_SUCCESS; int vrc; - char *new_val = NULL; + char* new_val = NULL; - if (cfg == NULL) + if (cfg == NULL) { return EINVAL; + } #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ vrc = validate_value(#sec, #key, cfg->sec##_##key, #typ, vfn, &new_val); \ @@ -773,7 +897,7 @@ int unifyfs_config_validate(unifyfs_cfg_t *cfg) return rc; } -int contains_expression(const char *val) +int contains_expression(const char* val) { static char expr_chars[8] = {'(', ')', '+', '-', '*', '/', '%', '^'}; size_t s; @@ -795,11 +919,12 @@ int contains_expression(const char *val) return 0; } -int configurator_bool_val(const char *val, - bool *b) +int configurator_bool_val(const char* val, + bool* b) { - if ((val == NULL) || (b == NULL)) + if ((val == NULL) || (b == NULL)) 
{ return EINVAL; + } if (1 == strlen(val)) { switch (val[0]) { @@ -820,44 +945,45 @@ int configurator_bool_val(const char *val, default: return 1; } - } - else if ((strcmp(val, "no") == 0) - || (strcmp(val, "off") == 0) - || (strcmp(val, "false") == 0)) { + } else if ((strcmp(val, "no") == 0) + || (strcmp(val, "off") == 0) + || (strcmp(val, "false") == 0)) { *b = false; return 0; - } - else if ((strcmp(val, "yes") == 0) - || (strcmp(val, "on") == 0) - || (strcmp(val, "true") == 0)) { + } else if ((strcmp(val, "yes") == 0) + || (strcmp(val, "on") == 0) + || (strcmp(val, "true") == 0)) { *b = true; return 0; } return EINVAL; } -int configurator_bool_check(const char *s, - const char *k, - const char *val, - char **o) +int configurator_bool_check(const char* s, + const char* k, + const char* val, + char** o) { bool b; - if (val == NULL) // unset is OK + if (val == NULL) { + // unset is OK return 0; + } return configurator_bool_val(val, &b); } -int configurator_float_val(const char *val, - double *d) +int configurator_float_val(const char* val, + double* d) { int err; double check, teval; - char *end = NULL; + char* end = NULL; - if ((val == NULL) || (d == NULL)) + if ((val == NULL) || (d == NULL)) { return EINVAL; + } if (contains_expression(val)) { err = 0; @@ -866,14 +992,13 @@ int configurator_float_val(const char *val, check = teval; else return EINVAL; - } - else { + } else { errno = 0; check = strtod(val, &end); err = errno; - if ((err == ERANGE) || (end == val)) + if ((err == ERANGE) || (end == val)) { return EINVAL; - else if (*end != 0) { + } else if (*end != 0) { switch (*end) { case 'f': case 'l': @@ -890,18 +1015,20 @@ int configurator_float_val(const char *val, return 0; } -int configurator_float_check(const char *s, - const char *k, - const char *val, - char **o) +int configurator_float_check(const char* s, + const char* k, + const char* val, + char** o) { int rc; size_t len; double d; - char *newval = NULL; + char* newval = NULL; - if (val == NULL) // unset is OK + if (val == NULL) { + // unset is OK return 0; + } rc = configurator_float_val(val, &d); if ((o != NULL) && (rc == 0) && contains_expression(val)) { @@ -916,16 +1043,17 @@ int configurator_float_check(const char *s, return rc; } -int configurator_int_val(const char *val, - long *l) +int configurator_int_val(const char* val, + long* l) { long check; double teval; int err; - char *end = NULL; + char* end = NULL; - if ((val == NULL) || (l == NULL)) + if ((val == NULL) || (l == NULL)) { return EINVAL; + } if (contains_expression(val)) { err = 0; @@ -934,14 +1062,13 @@ int configurator_int_val(const char *val, check = (long)teval; else return EINVAL; - } - else { + } else { errno = 0; check = strtol(val, &end, 0); err = errno; - if ((err == ERANGE) || (end == val)) + if ((err == ERANGE) || (end == val)) { return EINVAL; - else if (*end != 0) { + } else if (*end != 0) { switch (*end) { case 'l': case 'u': @@ -958,18 +1085,20 @@ int configurator_int_val(const char *val, return 0; } -int configurator_int_check(const char *s, - const char *k, - const char *val, - char **o) +int configurator_int_check(const char* s, + const char* k, + const char* val, + char** o) { int rc; size_t len; long l; - char *newval = NULL; + char* newval = NULL; - if (val == NULL) // unset is OK + if (val == NULL) { + // unset is OK return 0; + } rc = configurator_int_val(val, &l); if ((o != NULL) && (rc == 0) && contains_expression(val)) { @@ -984,52 +1113,56 @@ int configurator_int_check(const char *s, return rc; } -int configurator_file_check(const 
char *s, - const char *k, - const char *val, - char **o) +int configurator_file_check(const char* s, + const char* k, + const char* val, + char** o) { int rc; struct stat st; - if (val == NULL) + if (val == NULL) { return 0; + } rc = stat(val, &st); if (rc == 0) { - if (st.st_mode & S_IFREG) + if (st.st_mode & S_IFREG) { return 0; - else + } else { return ENOENT; + } } return errno; // invalid } -int configurator_directory_check(const char *s, - const char *k, - const char *val, - char **o) +int configurator_directory_check(const char* s, + const char* k, + const char* val, + char** o) { int mode, rc; struct stat st; - if (val == NULL) + if (val == NULL) { return 0; + } // check dir exists rc = stat(val, &st); if (rc == 0) { - if (st.st_mode & S_IFDIR) + if (st.st_mode & S_IFDIR) { return 0; - else + } else { return ENOTDIR; - } - else { // try to create it + } + } else { // try to create it mode = 0770; // S_IRWXU | S_IRWXG rc = mkdir(val, mode); - if (rc == 0) + if (rc == 0) { return 0; - else + } else { return errno; // invalid + } } } diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 0254cd405..5ffa9474d 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -98,6 +98,12 @@ extern "C" { #endif +/* UnifyFS config option struct (key-value pair) */ +typedef struct unifyfs_config_option { + const char* opt_name; + const char* opt_value; +} unifyfs_cfg_option; + typedef enum { INVALID_PROCESS_TYPE = 0, UNIFYFS_CLIENT = 1, @@ -132,80 +138,88 @@ typedef struct { /* initialization and cleanup */ -int unifyfs_config_init(unifyfs_cfg_t *cfg, - int argc, - char **argv); +int unifyfs_config_init(unifyfs_cfg_t* cfg, + int argc, char** argv, + int nopt, unifyfs_cfg_option* options); int unifyfs_config_fini(unifyfs_cfg_t *cfg); /* print configuration to specified file (or stderr if fp==NULL) */ -void unifyfs_config_print(unifyfs_cfg_t *cfg, - FILE *fp); +void unifyfs_config_print(unifyfs_cfg_t* cfg, + FILE* fp); /* print configuration in .INI format to specified file (or stderr) */ -void unifyfs_config_print_ini(unifyfs_cfg_t *cfg, - FILE *inifp); +void unifyfs_config_print_ini(unifyfs_cfg_t* cfg, + FILE* inifp); /* used internally, but may be useful externally */ -int unifyfs_config_set_defaults(unifyfs_cfg_t *cfg); +int unifyfs_config_set_defaults(unifyfs_cfg_t* cfg); -void unifyfs_config_cli_usage(char *arg0); -void unifyfs_config_cli_usage_error(char *arg0, - char *err_msg); +void unifyfs_config_cli_usage(char* arg0); +void unifyfs_config_cli_usage_error(char* arg0, + char* err_msg); -int unifyfs_config_process_cli_args(unifyfs_cfg_t *cfg, +int unifyfs_config_process_cli_args(unifyfs_cfg_t* cfg, int argc, - char **argv); + char** argv); + +int unifyfs_config_process_environ(unifyfs_cfg_t* cfg); + +int unifyfs_config_process_ini_file(unifyfs_cfg_t* cfg, + const char* file); -int unifyfs_config_process_environ(unifyfs_cfg_t *cfg); +int unifyfs_config_process_option(unifyfs_cfg_t* cfg, + const char* opt_name, + const char* opt_val); -int unifyfs_config_process_ini_file(unifyfs_cfg_t *cfg, - const char *file); +int unifyfs_config_process_options(unifyfs_cfg_t* cfg, + int nopt, + unifyfs_cfg_option* options); -int unifyfs_config_validate(unifyfs_cfg_t *cfg); +int unifyfs_config_validate(unifyfs_cfg_t* cfg); /* validate function prototype - Returns: 0 for valid input, non-zero otherwise. 
- out_val: set this output parameter to specify an alternate value */ -typedef int (*configurator_validate_fn)(const char *section, - const char *key, - const char *val, - char **out_val); +typedef int (*configurator_validate_fn)(const char* section, + const char* key, + const char* val, + char** out_val); /* predefined validation functions */ -int configurator_bool_val(const char *val, - bool *b); -int configurator_bool_check(const char *section, - const char *key, - const char *val, - char **oval); - -int configurator_float_val(const char *val, - double *d); -int configurator_float_check(const char *section, - const char *key, - const char *val, - char **oval); - -int configurator_int_val(const char *val, - long *l); -int configurator_int_check(const char *section, - const char *key, - const char *val, - char **oval); - -int configurator_file_check(const char *section, - const char *key, - const char *val, - char **oval); - -int configurator_directory_check(const char *section, - const char *key, - const char *val, - char **oval); +int configurator_bool_val(const char* val, + bool* b); +int configurator_bool_check(const char* section, + const char* key, + const char* val, + char** oval); + +int configurator_float_val(const char* val, + double* d); +int configurator_float_check(const char* section, + const char* key, + const char* val, + char** oval); + +int configurator_int_val(const char* val, + long* l); +int configurator_int_check(const char* section, + const char* key, + const char* val, + char** oval); + +int configurator_file_check(const char* section, + const char* key, + const char* val, + char** oval); + +int configurator_directory_check(const char* section, + const char* key, + const char* val, + char** oval); #ifdef __cplusplus diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index 03fe23b37..0c33e7369 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -386,7 +386,7 @@ int unifyfs_logio_init_client(const int app_id, /* Close logio context */ int unifyfs_logio_close(logio_context* ctx, - int clean_spill) + int clean_storage) { if (NULL == ctx) { return EINVAL; @@ -395,6 +395,9 @@ int unifyfs_logio_close(logio_context* ctx, int rc; if (NULL != ctx->shmem) { /* release shmem region */ + if (clean_storage) { + unifyfs_shm_unlink(ctx->shmem); + } rc = unifyfs_shm_free(&(ctx->shmem)); if (rc != UNIFYFS_SUCCESS) { LOGERR("Failed to release logio shmem region!"); @@ -422,7 +425,7 @@ int unifyfs_logio_close(logio_context* ctx, } ctx->spill_fd = -1; } - if (clean_spill && (ctx->spill_file != NULL)) { + if (clean_storage && (ctx->spill_file != NULL)) { rc = unlink(ctx->spill_file); if (rc != 0) { int err = errno; diff --git a/common/src/unifyfs_logio.h b/common/src/unifyfs_logio.h index 8a589fe53..cc683ec46 100644 --- a/common/src/unifyfs_logio.h +++ b/common/src/unifyfs_logio.h @@ -71,12 +71,12 @@ int unifyfs_logio_init_client(const int app_id, /** * Close logio context. * - * @param ctx pointer to logio context - * @param clean_spill set to non-zero to have server remove spill file + * @param ctx pointer to logio context + * @param clean_storage set to non-zero to have server remove data storage * @return UNIFYFS_SUCCESS, or error code */ int unifyfs_logio_close(logio_context* ctx, - int clean_spill); + int clean_storage); /** * Allocate write space from logio context. 
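A minimal caller-side sketch of the renamed ``clean_storage`` flag above (illustrative only, not part of the patch; it assumes the client's ``logio_ctx`` pointer used elsewhere in this series): passing a non-zero value asks ``unifyfs_logio_close()`` to remove the backing data storage, i.e. unlink the shmem region and any spill file, in addition to detaching from them.

    /* Hypothetical teardown sketch: close the log-based I/O context and,
     * because this client is shutting down for good, also remove its data
     * storage (shmem region and spill file) by passing clean_storage != 0. */
    int rc = unifyfs_logio_close(logio_ctx, 1);
    if (rc != UNIFYFS_SUCCESS) {
        LOGERR("failed to close logio context (rc=%d)", rc);
    }
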
diff --git a/common/src/unifyfs_meta.h b/common/src/unifyfs_meta.h index 24d5dc460..3307f13c8 100644 --- a/common/src/unifyfs_meta.h +++ b/common/src/unifyfs_meta.h @@ -239,7 +239,7 @@ void unifyfs_file_attr_to_stat(unifyfs_file_attr_t* fattr, struct stat* sb) sb->st_size = fattr->size; /* TODO: use cfg.logio_chunk_size here for st_blksize - * and report acutal chunks allocated for st_blocks */ + * and report actual chunks allocated for st_blocks */ sb->st_blksize = UNIFYFS_STAT_DEFAULT_BLKSIZE; sb->st_blocks = fattr->size / UNIFYFS_STAT_DEFAULT_BLKSIZE; if (fattr->size % UNIFYFS_STAT_DEFAULT_BLKSIZE > 0) { diff --git a/docs/configuration.rst b/docs/configuration.rst index 6ec781426..56cc12aa2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -177,20 +177,20 @@ is used, the value must immediately follow the option character (e.g., ``-Cyes`` .. table:: ``unifyfsd`` command line options :widths: auto - ====================== ======== - LongOpt ShortOpt - ====================== ======== - --unifyfs-cleanup -C - --unifyfs-configfile -f - --unifyfs-consistency -c - --unifyfs-daemonize -D - --unifyfs-mountpoint -m - --log-verbosity -v - --log-file -l - --log-dir -L - --runstate-dir -R - --server-hostfile -H - --sharedfs-dir -S - --server-init_timeout -t - ====================== ======== + ========================= ======== + LongOpt ShortOpt + ========================= ======== + ``--unifyfs-cleanup`` ``-C`` + ``--unifyfs-configfile`` ``-f`` + ``--unifyfs-consistency`` ``-c`` + ``--unifyfs-daemonize`` ``-D`` + ``--unifyfs-mountpoint`` ``-m`` + ``--log-verbosity`` ``-v`` + ``--log-file`` ``-l`` + ``--log-dir`` ``-L`` + ``--runstate-dir`` ``-R`` + ``--server-hostfile`` ``-H`` + ``--sharedfs-dir`` ``-S`` + ``--server-init_timeout`` ``-t`` + ========================= ======== diff --git a/docs/index.rst b/docs/index.rst index 8e548d35a..cf4cc4979 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,7 @@ UnifyFS: A file system for burst buffers configuration run examples + library_api .. toctree:: :maxdepth: 2 diff --git a/docs/library_api.rst b/docs/library_api.rst new file mode 100644 index 000000000..6a0177929 --- /dev/null +++ b/docs/library_api.rst @@ -0,0 +1,360 @@ +========================== +UnifyFS Client API Library +========================== + +This section describes the purpose, concepts, and usage of the UnifyFS client +API library. + +------------------ +Client API Purpose +------------------ + +The client API library provides a direct interface for UnifyFS configuration, +namespace management, and batched file I/O and transfer operations. The library +is primarily targeted for use by I/O middleware software such as HDF5 and +VeloC, but is also useful for user applications needing programmatic control +and interactions with UnifyFS. + +------------------- +Client API Concepts +------------------- + +Namespace (aka Mountpoint) +************************** + +All UnifyFS clients provide the mountpoint prefix (e.g., "/unifyfs") that is +used to distinguish the UnifyFS namespace from other file systems available +to the client application. All absolute file paths that include the mountpoint +prefix are treated as belonging to the associated UnifyFS namespace. + +Using the client API, an application or I/O middleware system can operate on +multiple UnifyFS namespaces concurrently. + +File System Handle +****************** + +All client API library methods require a file system handle parameter of type +``unifyfs_handle``. 
Users obtain a valid handle via an API call to +``unifyfs_initialize()``, which specifies the mountpoint prefix and +configuration settings associated with the handle. + +Multiple handles can be acquired by the same client. This permits access to +multiple namespaces, or different configured behaviors for the same namespace. + +Global File Identifier +********************** + +A global file identifier (gfid) is a unique integer identifier for a given +absolute file path within a UnifyFS namespace. Clients accessing the exact +same file path are guaranteed to obtain the same gfid value when creating or +opening the file. I/O operations use the gfid to identify the target file. + +Note that unlike POSIX file descriptors, a gfid is strictly a unique identifier +and has no associated file state such as a current file position pointer. As +such, it is valid to obtain the gfid for a file in a single process (e.g., via +file creation), and then share the resulting gfid value among other parallel +processes via a collective communication mechanism. + +---------------- +Client API Types +---------------- + +The file system handle type is a pointer to an opaque client structure that +records the associated mountpoint and configuration. + +.. code-block:: C + :caption: File system handle type + + /* UnifyFS file system handle (opaque pointer) */ + typedef struct unifyfs_client* unifyfs_handle; + +I/O requests take the form of a ``unifyfs_io_request`` structure that includes +the target file gfid, the specific I/O operation (``unifyfs_ioreq_op``) to be +applied, and associated operation parameters such as the file offset or user +buffer and size. The structure also contains fields used for tracking the +status of the request (``unifyfs_ioreq_state``) and operation results +(``unifyfs_ioreq_result``). + +.. code-block:: C + :caption: File I/O request types + + /* I/O request structure */ + typedef struct unifyfs_io_request { + /* user-specified fields */ + void* user_buf; + size_t nbytes; + off_t offset; + unifyfs_gfid gfid; + unifyfs_ioreq_op op; + + /* status/result fields */ + unifyfs_ioreq_state state; + unifyfs_ioreq_result result; + } unifyfs_io_request; + + /* enumeration of supported I/O request operations */ + typedef enum unifyfs_ioreq_op { + UNIFYFS_IOREQ_NOP = 0, + UNIFYFS_IOREQ_OP_READ, + UNIFYFS_IOREQ_OP_WRITE, + UNIFYFS_IOREQ_OP_SYNC_DATA, + UNIFYFS_IOREQ_OP_SYNC_META, + UNIFYFS_IOREQ_OP_TRUNC, + UNIFYFS_IOREQ_OP_ZERO, + } unifyfs_ioreq_op; + + /* enumeration of I/O request states */ + typedef enum unifyfs_ioreq_state { + UNIFYFS_IOREQ_STATE_INVALID = 0, + UNIFYFS_IOREQ_STATE_IN_PROGRESS, + UNIFYFS_IOREQ_STATE_CANCELED, + UNIFYFS_IOREQ_STATE_COMPLETED + } unifyfs_ioreq_state; + + /* structure to hold I/O request result values */ + typedef struct unifyfs_ioreq_result { + int error; + int rc; + size_t count; + } unifyfs_ioreq_result; + +For the ``unifyfs_ioreq_result`` structure, successful operations will set the +``rc`` and ``count`` fields as applicable to the specific operation type. All +operational failures are reported by setting the ``error`` field to a non-zero +value corresponding the the operation failure code, which is often a POSIX +errno value. + +File transfer requests use a ``unifyfs_transfer_request`` structure that +includes the source and destination file paths, transfer mode, and a flag +indicating whether parallel file transfer should be used. 
Similar to I/O +requests, the structure also contains fields used for tracking the request +status and transfer operation result. + +.. code-block:: C + :caption: File transfer request types + + /* File transfer request structure */ + typedef struct unifyfs_transfer_request { + /* user-specified fields */ + const char* src_path; + const char* dst_path; + unifyfs_transfer_mode mode; + int use_parallel; + + /* status/result fields */ + unifyfs_ioreq_state state; + unifyfs_ioreq_result result; + } unifyfs_transfer_request; + + /* enumeration of supported I/O request operations */ + typedef enum unifyfs_transfer_mode { + UNIFYFS_TRANSFER_MODE_INVALID = 0, + UNIFYFS_TRANSFER_MODE_COPY, // simple copy to destination + UNIFYFS_TRANSFER_MODE_MOVE // copy, then remove source + } unifyfs_transfer_mode; + +------------------------ +Example Client API Usage +------------------------ + +To get started using the client API, please add the following to your client +source code files that will make calls to API methods. You will also need to +modify your client application build process to link with the +``libunifyfs_api`` library. + +.. code-block:: C + :caption: Including the API header + + #include + +The common pattern for using the client API is to initialize a UnifyFS file +system handle, perform a number of operations using that handle, and then +release the handle. As previously mentioned, the same client process may +initialize multiple file system handles and use them concurrently, either +to work with multiple namespaces, or to use different configured behaviors +with different handles sharing the same namespace. + +File System Handle Initialization and Finalization +************************************************** + +To initialize a handle to UnifyFS, the client application uses the +``unifyfs_initialize()`` method as shown below. This method takes the namespace +mountpoint prefix and an array of optional configuration parameter settings as +input parameters, and initializes the value of the passed file system handle +upon success. + +In the example below, the ``logio.chunk_size`` configuration +parameter, which controls the size of the log-based I/O data chunks, is set to +the value of 32768. See :doc:`configuration` +for further options for customizing the behavior of UnifyFS. + +.. code-block:: C + :caption: UnifyFS handle initialization + + int n_configs = 1; + unifyfs_cfg_option chk_size = { .opt_name = "logio.chunk_size", + .opt_value = "32768" }; + + const char* unifyfs_prefix = "/my/unifyfs/namespace"; + unifyfs_handle fshdl = UNIFYFS_INVALID_HANDLE; + int rc = unifyfs_initialize(unifyfs_prefix, &chk_size, n_configs, &fshdl); + +Once all UnifyFS operation using the handle have been completed, the client +application should call ``unifyfs_finalize()`` as shown below to release the +resources associated with the handle. + +.. code-block:: C + :caption: UnifyFS handle finalization + + int rc = unifyfs_finalize(fshdl); + +File Creation, Use, and Removal +******************************* + +New files should be created by a single client process using ``unifyfs_create()`` +as shown below. Note that if multiple clients attempt to create the same file, +only one will succeed. + +.. note:: + Currently, the ``create_flags`` parameter is unused; it + is reserved for future use to indicate file-specific UnifyFS behavior. + +.. 
code-block:: C + :caption: UnifyFS file creation + + const char* filename = "/my/unifyfs/namespace/a/new/file"; + int create_flags = 0; + unifyfs_gfid gfid = UNIFYFS_INVALID_GFID; + int rc = unifyfs_create(fshdl, create_flags, filename, &gfid); + +Existing files can be opened by any client process using ``unifyfs_open()``. + +.. code-block:: C + :caption: UnifyFS file use + + const char* filename = "/my/unifyfs/namespace/an/existing/file"; + unifyfs_gfid gfid = UNIFYFS_INVALID_GFID; + int rc = unifyfs_open(fshdl, filename, &gfid); + +When no longer required, files can be deleted using ``unifyfs_remove()``. + +.. code-block:: C + :caption: UnifyFS file removal + + const char* filename = "/my/unifyfs/namespace/an/existing/file"; + int rc = unifyfs_remove(fshdl, filename); + +Batched File I/O +**************** + +File I/O operations in the client API use a batched request interface similar +to POSIX ``lio_listio()``. A client application dispatches an array of I/O +operation requests, where each request identifies the target file gfid, the +operation type (e.g., read, write, or truncate), and associated operation +parameters. Upon successful dispatch, the operations will be executed by +UnifyFS in an asynchronous manner that allows the client to overlap other +computation with I/O. The client application must then explicitly wait for +completion of the requests in the batch. After an individual request has been +completed (or canceled by the client), the request's operation results +can be queried. + +When dispatching a set of requests that target the same file, there is an order +imposed on the types of operations. First, all read operations are processed, +followed by writes, then truncations, and finally synchronization operations. +Note that this means a read request will not observe any data written in the +same batch. + +A simple use case for batched I/O is shown below, where the client dispatches +a batch of requests including several rank-strided write operations followed by +a metadata sync to make those writes visible to other clients, and then +immediately waits for completion of the entire batch. + +.. code-block:: C + :caption: Synchronous Batched I/O + + /* write and sync file metadata */ + size_t n_chks = 10; + size_t chunk_size = 1048576; + size_t block_size = chunk_size * total_ranks; + size_t n_reqs = n_chks + 1; + unifyfs_io_request my_reqs[n_reqs]; + for (size_t i = 0; i < n_chks; i++) { + my_reqs[i].op = UNIFYFS_IOREQ_OP_WRITE; + my_reqs[i].gfid = gfid; + my_reqs[i].nbytes = chunk_size; + my_reqs[i].offset = (off_t)((i * block_size) + (my_rank * chunk_size)); + my_reqs[i].user_buf = my_databuf + (i * chksize); + } + my_reqs[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + my_reqs[n_chks].gfid = gfid; + + rc = unifyfs_dispatch_io(fshdl, n_reqs, my_reqs); + if (rc == UNIFYFS_SUCCESS) { + int waitall = 1; + rc = unifyfs_wait_io(fshdl, n_reqs, my_reqs, waitall); + if (rc == UNIFYFS_SUCCESS) { + for (size_t i = 0; i < n_reqs; i++) { + assert(my_reqs[i].result.error == 0); + } + } + } + +Batched File Transfers +********************** + +File transfer operations in the client API also use a batched request +interface. A client application dispatches an array of file transfer +requests, where each request identifies the source and destination file +paths and the transfer mode. Two transfer modes are currently supported: + 1. COPY - Copy source file to destination path. + 2. MOVE - Copy source file to destination path, then remove source file. 
+ +Upon successful dispatch, the transfer operations will be executed by +UnifyFS in an asynchronous manner that allows the client to overlap other +computation with I/O. The client application must then explicitly wait for +completion of the requests in the batch. After an individual request has been +completed (or canceled by the client), the request's operation results +can be queried. + +A simple use case for batched transfer is shown below, where the client +dispatches a batch of requests and then immediately waits for completion of +the entire batch. + +.. code-block:: C + :caption: Synchronous Batched File Transfers + + /* move output files from UnifyFS to parallel file system */ + const char* destfs_prefix = "/some/parallel/filesystem/location"; + size_t n_files = 3; + unifyfs_transfer_request my_reqs[n_files]; + char src_file[PATHLEN_MAX]; + char dst_file[PATHLEN_MAX]; + for (int i = 0; i < (int)n_files; i++) { + snprintf(src_file, sizeof(src_file), "%s/file.%d", unifyfs_prefix, i); + snprintf(dst_file, sizeof(src_file), "%s/file.%d", destfs_prefix, i); + my_reqs[i].src_path = strdup(src_file); + my_reqs[i].dst_path = strdup(dst_file); + my_reqs[i].mode = UNIFYFS_TRANSFER_MODE_MOVE; + my_reqs[i].use_parallel = 1; + } + + rc = unifyfs_dispatch_transfer(fshdl, n_files, my_reqs); + if (rc == UNIFYFS_SUCCESS) { + int waitall = 1; + rc = unifyfs_wait_transfer(fshdl, n_files, my_reqs, waitall); + if (rc == UNIFYFS_SUCCESS) { + for (int i = 0; i < (int)n_files; i++) { + assert(my_reqs[i].result.error == 0); + } + } + } + +More Examples +************* + +Additional examples demonstrating use of the client API can be found in +the unit tests (see api-unit-tests_). + +.. explicit external hyperlink targets + +.. _api-unit-tests: https://github.com/LLNL/UnifyFS/blob/dev/t/api \ No newline at end of file diff --git a/scripts/checkpatch.sh b/scripts/checkpatch.sh index ded03b541..10086a3c3 100755 --- a/scripts/checkpatch.sh +++ b/scripts/checkpatch.sh @@ -25,8 +25,9 @@ checkpatch_ignore+=",ARRAY_SIZE" # Don't require use of ARRAY_SIZE macro checkpatch_ignore+=",USE_NEGATIVE_ERRNO" # We don't return negative errnos checkpatch_ignore+=",NEW_TYPEDEFS" checkpatch_ignore+=",ENOSYS" -checkpatch_ignore+=",CONSTANT_COMPARISON" # Allow consts on left: if(COSNT==val) +checkpatch_ignore+=",CONSTANT_COMPARISON" # Allow consts on left: if(CONST==val) checkpatch_ignore+=",VOLATILE" # Allow use of volatile keyword +checkpatch_ignore+=",SYMBOLIC_PERMS" # Allow symbolic perms (S_IRUSR) checkpatch_cmd+=" --ignore $checkpatch_ignore" diff --git a/server/src/unifyfs_fops_mdhim.c b/server/src/unifyfs_fops_mdhim.c index 7e19970d8..00bcdbcb5 100644 --- a/server/src/unifyfs_fops_mdhim.c +++ b/server/src/unifyfs_fops_mdhim.c @@ -259,11 +259,11 @@ static int mdhim_fsync(unifyfs_fops_ctx_t* ctx, int gfid) int count = 0; for (i = 0; i < extent_num_entries; i++) { /* get file offset, length, and log offset for this entry */ - unifyfs_index_t* meta = &meta_payload[i]; - assert(gfid == meta->gfid); - size_t offset = meta->file_pos; - size_t length = meta->length; - size_t logpos = meta->log_pos; + unifyfs_index_t* ndx = &meta_payload[i]; + assert(gfid == ndx->gfid); + size_t offset = ndx->file_pos; + size_t length = ndx->length; + size_t logpos = ndx->log_pos; /* split this entry at the offset boundaries */ int used = split_index( diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c index d9fe69784..9eb0e1811 100644 --- a/server/src/unifyfs_fops_rpc.c +++ b/server/src/unifyfs_fops_rpc.c @@ -12,6 +12,7 
@@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ +#include "margo_server.h" #include "unifyfs_inode_tree.h" #include "unifyfs_inode.h" #include "unifyfs_group_rpc.h" @@ -301,7 +302,9 @@ int submit_read_request(unifyfs_fops_ctx_t* ctx, } else { LOGDBG("extent(gfid=%d, offset=%lu, len=%lu) has no data", ext->gfid, ext->offset, ext->length); - ret = ENODATA; + invoke_client_mread_req_complete_rpc(app_id, client_id, + client_mread, extent_ndx, + ENODATA); } } diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 28e62ee12..072a3ad8e 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -221,6 +221,9 @@ static int release_read_req(reqmgr_thrd_t* thrd_ctrl, } memset((void*)rdreq, 0, sizeof(server_read_req_t)); thrd_ctrl->num_read_reqs--; + if (0 == thrd_ctrl->num_read_reqs) { + thrd_ctrl->next_rdreq_ndx = 0; + } LOGDBG("after release (active=%d, next=%d)", thrd_ctrl->num_read_reqs, thrd_ctrl->next_rdreq_ndx); RM_REQ_UNLOCK(thrd_ctrl); diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 95386f88e..672763391 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -264,7 +264,7 @@ int main(int argc, char* argv[]) char rank_str[16] = {0}; char dbg_fname[UNIFYFS_MAX_FILENAME] = {0}; - rc = unifyfs_config_init(&server_cfg, argc, argv); + rc = unifyfs_config_init(&server_cfg, argc, argv, 0, NULL); if (rc != 0) { exit(1); } diff --git a/t/0700-unifyfs-stage-full.t b/t/0700-unifyfs-stage-full.t index de8540ba6..8db4f25b7 100755 --- a/t/0700-unifyfs-stage-full.t +++ b/t/0700-unifyfs-stage-full.t @@ -14,51 +14,64 @@ test_expect_success "testing temp dir exists" ' test_path_is_dir ${UNIFYFS_TEST_TMPDIR} ' -mkdir -p ${UNIFYFS_TEST_TMPDIR}/config_0700 -mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_source -mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 +stage_cfg_dir=${UNIFYFS_TEST_TMPDIR}/stage/config_0700 +stage_src_dir=${UNIFYFS_TEST_TMPDIR}/stage/source +stage_dst_dir=${UNIFYFS_TEST_TMPDIR}/stage/destination_0700 +mkdir -p $stage_cfg_dir $stage_src_dir $stage_dst_dir test_expect_success "stage testing dirs exist" ' - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/config_0700 - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_source - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 + test_path_is_dir $stage_cfg_dir && + test_path_is_dir $stage_src_dir && + test_path_is_dir $stage_dst_dir ' -dd if=/dev/urandom bs=4M count=1 of=${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file +stage_src_file=$stage_src_dir/source_0700.file +stage_im_file=$UNIFYFS_TEST_MOUNT/intermediate_0700.file +stage_dst_file=$stage_dst_dir/destination_0700.file + +dd if=/dev/urandom bs=4M count=1 of=$stage_src_file &>/dev/null test_expect_success "source.file exists" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file + test_path_is_file $stage_src_file ' -rm -f ${UNIFYFS_TEST_TMPDIR}/config_0700/* -rm -f ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/* +rm -f $stage_cfg_dir/* $stage_dst_dir/* test_expect_success "config_0700 directory is empty" ' - test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/config_0700 + test_dir_is_empty $stage_cfg_dir ' -echo "\"${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file\" \"${UNIFYFS_TEST_MOUNT}/intermediate.file\"" > ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest -echo "\"${UNIFYFS_TEST_MOUNT}/intermediate.file\" \"${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file\"" > 
${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest +stage_in_manifest=$stage_cfg_dir/stage_IN.manifest +stage_out_manifest=$stage_cfg_dir/stage_OUT.manifest + +echo "\"$stage_src_file\" \"$stage_im_file\"" > $stage_in_manifest +echo "\"$stage_im_file\" \"$stage_dst_file\"" > $stage_out_manifest test_expect_success "config_0700 directory now has manifest files" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest + test_path_is_file $stage_in_manifest && + test_path_is_file $stage_out_manifest ' test_expect_success "target directory is empty" ' - test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700 + test_dir_is_empty $stage_dst_dir ' -$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -m ${UNIFYFS_TEST_MOUNT} ${UNIFYFS_TEST_TMPDIR}/config_0700/test_IN.manifest > ${UNIFYFS_TEST_TMPDIR}/config_0700/stage_IN_output.OUT 2>&1 +stage_in_log=$stage_cfg_dir/stage_IN.log +stage_out_log=$stage_cfg_dir/stage_OUT.log +stage_exe=${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -m ${UNIFYFS_TEST_MOUNT} ${UNIFYFS_TEST_TMPDIR}/config_0700/test_OUT.manifest > ${UNIFYFS_TEST_TMPDIR}/config_0700/stage_OUT_output.OUT 2>&1 +$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} $stage_in_manifest &> $stage_in_log + +$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} $stage_out_manifest &> $stage_out_log test_expect_success "input file has been staged to output" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file + test_path_is_file $stage_dst_file ' +export TEST_CMP='cmp --quiet' + test_expect_success "final output is identical to initial input" ' - test_might_fail test_cmp ${UNIFYFS_TEST_TMPDIR}/stage_source/source_0700.file ${UNIFYFS_TEST_TMPDIR}/stage_destination_0700/destination_0700.file + test_might_fail test_cmp $stage_src_file $stage_dst_file ' test_done diff --git a/t/8000-client-api.t b/t/8000-client-api.t new file mode 100755 index 000000000..c7c56bf6e --- /dev/null +++ b/t/8000-client-api.t @@ -0,0 +1,8 @@ +#!/bin/bash +# +# Source sharness environment scripts to pick up test environment +# and UnifyFS runtime settings. +# +. $(dirname $0)/sharness.d/00-test-env.sh +. 
$(dirname $0)/sharness.d/01-unifyfs-settings.sh +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/api/client_api_test.t diff --git a/t/9300-unifyfs-stage-isolated.t b/t/9300-unifyfs-stage-isolated.t index 007853999..81cac7d0b 100755 --- a/t/9300-unifyfs-stage-isolated.t +++ b/t/9300-unifyfs-stage-isolated.t @@ -14,49 +14,57 @@ test_expect_success "testing temp dir exists" ' test_path_is_dir ${UNIFYFS_TEST_TMPDIR} ' -mkdir -p ${UNIFYFS_TEST_TMPDIR}/config_9300 -mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_source -mkdir -p ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 +stage_cfg_dir=${UNIFYFS_TEST_TMPDIR}/stage/config_9300 +stage_src_dir=${UNIFYFS_TEST_TMPDIR}/stage/source +stage_dst_dir=${UNIFYFS_TEST_TMPDIR}/stage/destination_9300 +mkdir -p $stage_cfg_dir $stage_src_dir $stage_dst_dir test_expect_success "stage testing dirs exist" ' - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/config_9300 - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_source - test_path_is_dir ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 + test_path_is_dir $stage_cfg_dir && + test_path_is_dir $stage_src_dir && + test_path_is_dir $stage_dst_dir ' -# NOTE: we're using the unifyfs-stage binary as its own transfer data target -# because we know it's there and it's filled with non-zero data. -cp ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file +stage_src_file=$stage_src_dir/source_9300.file +stage_dst_file=$stage_dst_dir/destination_9300.file -test_expect_success "source.file exists" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file +rm -f $stage_cfg_dir/* $stage_dst_dir/* + +test_expect_success "config_9300 directory is empty" ' + test_dir_is_empty $stage_cfg_dir ' -rm -f ${UNIFYFS_TEST_TMPDIR}/config_9300/* -rm -f ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/* +# NOTE: we're using the unifyfs-stage binary as its own transfer data target +# because we know it's there and it's filled with non-zero data. 
+stage_exe=${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage +cp $stage_exe $stage_src_file -test_expect_success "config_9300 directory is empty" ' - test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/config_9300 +test_expect_success "source.file exists" ' + test_path_is_file $stage_src_file ' -echo "\"${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file\" \"${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file\"" > ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest +stage_manifest=$stage_cfg_dir/stage.manifest +echo "\"$stage_src_file\" \"$stage_dst_file\"" > $stage_manifest -test_expect_success "config_9300 directory now has manifest files" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest +test_expect_success "config_9300 directory now has manifest file" ' + test_path_is_file $stage_manifest ' test_expect_success "target directory is empty" ' - test_dir_is_empty ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300 + test_dir_is_empty $stage_dst_dir ' -$JOB_RUN_COMMAND ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage -N ${UNIFYFS_TEST_TMPDIR}/config_9300/test_INOUT.manifest > ${UNIFYFS_TEST_TMPDIR}/config_9300/stage_INOUT_output.OUT 2>&1 +stage_log=$stage_cfg_dir/stage.log +$JOB_RUN_COMMAND $stage_exe -N $stage_manifest &> $stage_log test_expect_success "input file has been staged to output" ' - test_path_is_file ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file + test_path_is_file $stage_dst_file ' +export TEST_CMP='cmp --quiet' + test_expect_success "final output is identical to initial input" ' - test_cmp ${UNIFYFS_TEST_TMPDIR}/stage_source/source_9300.file ${UNIFYFS_TEST_TMPDIR}/stage_destination_9300/destination_9300.file + test_cmp $stage_src_file $stage_dst_file ' test_done diff --git a/t/Makefile.am b/t/Makefile.am index 0a4a96695..674b4755e 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -19,6 +19,7 @@ TESTS += \ 0510-statfs-static.t \ 0600-stdio-static.t \ 0700-unifyfs-stage-full.t \ + 8000-client-api.t \ 9005-unifyfs-unmount.t \ 9010-stop-unifyfsd.t \ 9020-mountpoint-empty.t \ @@ -33,7 +34,8 @@ check_SCRIPTS = \ 0510-statfs-static.t \ 0600-stdio-static.t \ 0700-unifyfs-stage-full.t \ - 9005-unifyfs-unmount.t \ + 8000-client-api.t \ + 9005-unifyfs-unmount.t \ 9010-stop-unifyfsd.t \ 9020-mountpoint-empty.t \ 9200-seg-tree-test.t \ @@ -60,6 +62,7 @@ clean-local: rm -fr trash-directory.* test-results *.log test_run_env.sh libexec_PROGRAMS = \ + api/client_api_test.t \ common/seg_tree_test.t \ common/slotmap_test.t \ std/stdio-static.t \ @@ -74,6 +77,17 @@ if HAVE_GOTCHA sys/sysio-gotcha.t endif +test_api_ldadd = \ + $(top_builddir)/t/lib/libtap.la \ + $(top_builddir)/t/lib/libtestutil.la \ + $(top_builddir)/client/src/libunifyfs_api.la + +test_api_ldflags = \ + $(AM_LDFLAGS) \ + $(MPI_CLDFLAGS) \ + $(CP_WRAPPERS) \ + -static + test_common_ldadd = \ $(top_builddir)/t/lib/libtap.la \ $(top_builddir)/t/lib/libtestutil.la @@ -117,6 +131,18 @@ test_cppflags = \ $(MPI_CFLAGS) +api_client_api_test_t_SOURCES = \ + api/client_api_suite.h \ + api/client_api_suite.c \ + api/init-fini.c \ + api/create-open-remove.c \ + api/write-read-sync-stat.c \ + api/laminate.c + +api_client_api_test_t_CPPFLAGS = $(test_cppflags) +api_client_api_test_t_LDADD = $(test_api_ldadd) +api_client_api_test_t_LDFLAGS = $(test_api_ldflags) + sys_sysio_gotcha_t_SOURCES = \ sys/sysio_suite.h \ sys/sysio_suite.c \ diff --git a/t/api/client_api_suite.c b/t/api/client_api_suite.c new file mode 100644 index 000000000..ab65a318e --- /dev/null 
+++ b/t/api/client_api_suite.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +//#include +#include "client_api_suite.h" + +/* This is the collection of client API tests. + * + * To add new subtests to existing API functionality tests: + * 1. Simply add the tests (order matters) to the appropriate + * t/api/.c file. + * + * When new API functionality needs to be tested: + * 1. Create a t/api/.c source file with a function called: + * api__test(char *unifyfs_root) + * to contain all the TAP tests for that API functionality. + * 2. Add the function name to t/api/client_api_suite.h, with comments. + * 3. In t/Makefile.am, add the new file to the source file list for + * the api test suite (api_client_api_test_t_SOURCES). + * 4. The api__test function can now be used in this suite. */ + +int main(int argc, char* argv[]) +{ + int rc; + char* unifyfs_root = testutil_get_mount_point(); + + unifyfs_handle fshdl; + + //MPI_Init(&argc, &argv); + + plan(NO_PLAN); + + /* Add tests for new functionality below in the order desired for testing. + * + * *** NOTE *** + * The order of the tests does matter as some subsequent tests use + * functionality or files that were already tested. + */ + + rc = api_initialize_test(unifyfs_root, &fshdl); + if (rc == UNIFYFS_SUCCESS) { + api_create_open_remove_test(unifyfs_root, &fshdl); + + api_write_read_sync_stat_test(unifyfs_root, &fshdl, + (size_t)64 * KIB, (size_t)4 * KIB); + api_write_read_sync_stat_test(unifyfs_root, &fshdl, + (size_t)1 * MIB, (size_t)32 * KIB); + api_write_read_sync_stat_test(unifyfs_root, &fshdl, + (size_t)4 * MIB, (size_t)128 * KIB); + + api_laminate_test(unifyfs_root, &fshdl); + + api_finalize_test(unifyfs_root, &fshdl); + } + + //MPI_Finalize(); + + done_testing(); + + return 0; +} diff --git a/t/api/client_api_suite.h b/t/api/client_api_suite.h new file mode 100644 index 000000000..b02f04ac7 --- /dev/null +++ b/t/api/client_api_suite.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +/* This is the collection of client API tests. + * + * When new API functionality needs to be tested: + * 1. Create a t/api/.c file with a function called: + * api__test(char *unifyfs_root) + * to contain all the TAP tests for that API functionality. + * 2. Add the function name to this file, with comments. + * 3. In t/Makefile.am, add the new file to the source file list for + * the api test suite (api_client_api_test_t_SOURCES). + * 4. 
The api__test function can now be called from the suite's + * implementation in t/api/client_api_suite.c */ + +#ifndef T_CLIENT_API_SUITE_H +#define T_CLIENT_API_SUITE_H + +#include "t/lib/tap.h" +#include "t/lib/testutil.h" +#include "unifyfs_api.h" + +/* Tests API initialization */ +int api_initialize_test(char* unifyfs_root, + unifyfs_handle* fshdl); + +/* Tests API finalization */ +int api_finalize_test(char* unifyfs_root, + unifyfs_handle* fshdl); + +/* Tests file creation, open, and removal */ +int api_create_open_remove_test(char* unifyfs_root, + unifyfs_handle* fshdl); + +/* Tests file write, read, sync, and stat */ +int api_write_read_sync_stat_test(char* unifyfs_root, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize); + +/* Tests file laminate, with subsequent write/read/stat */ +int api_laminate_test(char* unifyfs_root, + unifyfs_handle* fshdl); + +#endif /* T_CLIENT_API_SUITE_H */ diff --git a/t/api/create-open-remove.c b/t/api/create-open-remove.c new file mode 100644 index 000000000..8d201a48f --- /dev/null +++ b/t/api/create-open-remove.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "client_api_suite.h" + +int api_create_open_remove_test(char* unifyfs_root, + unifyfs_handle* fshdl) +{ + /* Create a random file names at the mountpoint path to test */ + char testfile1[64]; + char testfile2[64]; + testutil_rand_path(testfile1, sizeof(testfile1), unifyfs_root); + testutil_rand_path(testfile2, sizeof(testfile2), unifyfs_root); + + //------------- + + diag("Starting API create tests"); + + int t1_flags = 0; + int t2_flags = 0; + unifyfs_gfid t1_gfid = UNIFYFS_INVALID_GFID; + unifyfs_gfid t2_gfid = UNIFYFS_INVALID_GFID; + + int rc = unifyfs_create(*fshdl, t1_flags, testfile1, &t1_gfid); + ok(rc == UNIFYFS_SUCCESS && t1_gfid != UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_create(*fshdl, t2_flags, testfile2, &t2_gfid); + ok(rc == UNIFYFS_SUCCESS && t2_gfid != UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + diag("Finished API create tests"); + + //------------- + + diag("Starting API open tests"); + + unifyfs_gfid t3_gfid = UNIFYFS_INVALID_GFID; + unifyfs_gfid t4_gfid = UNIFYFS_INVALID_GFID; + + rc = unifyfs_open(*fshdl, testfile1, &t3_gfid); + ok(rc == UNIFYFS_SUCCESS && t3_gfid == t1_gfid, + "%s:%d unifyfs_open(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_open(*fshdl, testfile2, &t4_gfid); + ok(rc == UNIFYFS_SUCCESS && t4_gfid == t2_gfid, + "%s:%d unifyfs_open(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + diag("Finished API open tests"); + + //------------- + + diag("Starting API remove tests"); + + rc = unifyfs_remove(*fshdl, testfile1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + if (UNIFYFS_SUCCESS == rc) { + unifyfs_gfid 
t5_gfid = UNIFYFS_INVALID_GFID; + rc = unifyfs_open(*fshdl, testfile1, &t5_gfid); + ok(rc != UNIFYFS_SUCCESS && t5_gfid == UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_open(%s) after unifyfs_remove() fails: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + } + + rc = unifyfs_remove(*fshdl, testfile2); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + if (UNIFYFS_SUCCESS == rc) { + unifyfs_gfid t6_gfid = UNIFYFS_INVALID_GFID; + rc = unifyfs_open(*fshdl, testfile1, &t6_gfid); + ok(rc != UNIFYFS_SUCCESS && t6_gfid == UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_open(%s) after unifyfs_remove() fails: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + } + + diag("Finished API remove tests"); + + //------------- + + return 0; +} diff --git a/t/api/init-fini.c b/t/api/init-fini.c new file mode 100644 index 000000000..ebb6e375a --- /dev/null +++ b/t/api/init-fini.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "client_api_suite.h" + +int api_initialize_test(char* unifyfs_root, + unifyfs_handle* fshdl) +{ + diag("Starting API initialization tests"); + + int n_configs = 1; + unifyfs_cfg_option chk_size = { .opt_name = "logio.chunk_size", + .opt_value = "32768" }; + + int rc = unifyfs_initialize(unifyfs_root, &chk_size, n_configs, fshdl); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_initialize() is successful: rc=%d (%s)", + __FILE__, __LINE__, rc, unifyfs_rc_enum_description(rc)); + + diag("Finished API initialization tests"); + return rc; +} + +int api_finalize_test(char* unifyfs_root, + unifyfs_handle* fshdl) +{ + diag("Starting API finalization tests"); + + int rc = unifyfs_finalize(*fshdl); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_finalize() is successful: rc=%d (%s)", + __FILE__, __LINE__, rc, unifyfs_rc_enum_description(rc)); + + diag("Finished API finalization tests"); + return rc; +} diff --git a/t/api/laminate.c b/t/api/laminate.c new file mode 100644 index 000000000..3b932f9ed --- /dev/null +++ b/t/api/laminate.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "client_api_suite.h" +#include + +/* Tests file laminate, with subsequent write/read/stat */ +int api_laminate_test(char* unifyfs_root, + unifyfs_handle* fshdl) +{ + size_t filesize = (size_t)32 * KIB; + size_t chksize = (size_t)4 * KIB; + + /* Create a random file name at the mountpoint path to test */ + char testfile[64]; + testutil_rand_path(testfile, sizeof(testfile), unifyfs_root); + + //------------- + + diag("Creating test file"); + + int flags = 0; + unifyfs_gfid gfid = UNIFYFS_INVALID_GFID; + int rc = unifyfs_create(*fshdl, flags, testfile, &gfid); + ok(rc == UNIFYFS_SUCCESS && gfid != UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + //------------- + + diag("Starting API lamination tests"); + + /** + * (1) write and sync testfile (no hole) + * (2) stat testfile, should report not laminated + * (3) laminate testfile + * (4) stat testfile, should report laminated + * (5) try write, should fail + * (6) read and check file contents + */ + + size_t n_chks = filesize / chksize; + size_t extra = filesize % chksize; + if (extra) { + /* test only supports exact multiples of chunk size */ + filesize -= extra; + } + + char* databuf = malloc(filesize); + char* readbuf = malloc(filesize); + if ((NULL != databuf) && (NULL != readbuf)) { + testutil_lipsum_generate(databuf, filesize, 0); + + /* (1) write and sync testfile (no hole) */ + unifyfs_io_request fops[n_chks + 1]; + for (size_t i = 0; i < n_chks; i++) { + fops[i].op = UNIFYFS_IOREQ_OP_WRITE; + fops[i].gfid = gfid; + fops[i].nbytes = chksize; + fops[i].offset = (off_t)(i * chksize); + fops[i].user_buf = databuf + (i * chksize); + } + fops[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + fops[n_chks].gfid = gfid; + + rc = unifyfs_dispatch_io(*fshdl, n_chks + 1, fops); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks + 1, fops, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + /* (2) stat testfile, should report not laminated */ + unifyfs_status status; + memset(&status, 0, sizeof(status)); + rc = unifyfs_stat(*fshdl, gfid, &status); + + /* expected size=filesize since writes have been synced */ + ok((rc == UNIFYFS_SUCCESS) && + (status.global_file_size == filesize) && (status.laminated == 0), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=%zu)," + " laminated=%d (expected=0), rc=%d (%s)", + __FILE__, __LINE__, testfile, + status.global_file_size, filesize, status.laminated, + rc, unifyfs_rc_enum_description(rc)); + + /* (3) laminate testfile */ + rc = unifyfs_laminate(*fshdl, testfile); + ok((rc == UNIFYFS_SUCCESS), + "%s:%d unifyfs_laminate(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, + rc, unifyfs_rc_enum_description(rc)); + + /* (4) stat testfile again, should report laminated */ + memset(&status, 0, sizeof(status)); + rc = unifyfs_stat(*fshdl, gfid, &status); + ok((rc == UNIFYFS_SUCCESS) && (status.laminated == 1), + "%s:%d unifyfs_stat(%s) is successful: laminated=%d (expected=1)," + " rc=%d (%s)", + __FILE__, __LINE__, testfile, + status.laminated, rc, unifyfs_rc_enum_description(rc)); + + /* (5) try write, should fail */ + rc = unifyfs_dispatch_io(*fshdl, 1, fops); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d 
unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, 1, fops, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + int err = fops[0].result.error; + size_t cnt = fops[0].result.count; + ok((err == EROFS) && (cnt == 0), + "%s:%d write(%s) after laminate fails: rc=%d (%s) expected EROFS", + __FILE__, __LINE__, testfile, + err, unifyfs_rc_enum_description(err)); + + /* (6) read and check full contents of all files */ + memset(readbuf, (int)'?', filesize); + unifyfs_io_request reads[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + reads[i].op = UNIFYFS_IOREQ_OP_READ; + reads[i].gfid = gfid; + reads[i].nbytes = chksize; + reads[i].offset = (off_t)(i * chksize); + reads[i].user_buf = readbuf + (i * chksize); + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, reads); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, reads, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + for (size_t i = 0; i < n_chks; i++) { + size_t bytes = reads[i].nbytes; + off_t off = reads[i].offset; + + /* check read operation status */ + err = reads[i].result.error; + cnt = reads[i].result.count; + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zu," + " rc=%d (%s)", __FILE__, __LINE__, testfile, (size_t)off, + bytes, cnt, err, unifyfs_rc_enum_description(err)); + + /* check valid data */ + uint64_t error_offset; + int check = testutil_lipsum_check(reads[i].user_buf, + (uint64_t)bytes, + (uint64_t)off, &error_offset); + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile, (size_t)off, bytes); + } + } + + diag("Finished API lamination tests"); + + //------------- + + diag("Removing test file"); + + rc = unifyfs_remove(*fshdl, testfile); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); + + //------------- + + return 0; +} diff --git a/t/api/write-read-sync-stat.c b/t/api/write-read-sync-stat.c new file mode 100644 index 000000000..fe6f42549 --- /dev/null +++ b/t/api/write-read-sync-stat.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include + +#include "client_api_suite.h" + +int api_write_read_sync_stat_test(char* unifyfs_root, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize) +{ + /* Create a random file names at the mountpoint path to test */ + char testfile1[64]; + char testfile2[64]; + char testfile3[64]; + testutil_rand_path(testfile1, sizeof(testfile1), unifyfs_root); + testutil_rand_path(testfile2, sizeof(testfile2), unifyfs_root); + testutil_rand_path(testfile3, sizeof(testfile3), unifyfs_root); + + //------------- + + diag("Creating test files"); + + int t1_flags = 0; + int t2_flags = 0; + int t3_flags = 0; + unifyfs_gfid t1_gfid = UNIFYFS_INVALID_GFID; + unifyfs_gfid t2_gfid = UNIFYFS_INVALID_GFID; + unifyfs_gfid t3_gfid = UNIFYFS_INVALID_GFID; + + int rc = unifyfs_create(*fshdl, t1_flags, testfile1, &t1_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t1_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_create(*fshdl, t2_flags, testfile2, &t2_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t2_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_create(*fshdl, t3_flags, testfile3, &t3_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t3_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + //------------- + + diag("Starting API write/read/truncate/sync/stat tests"); + + /** + * Overview of test workflow: + * (1) write and sync testfile1 (no hole) + * (2) write, but don't sync, testfile2 (with hole in middle) + * (3) write, but don't sync, testfile3 (with hole at end) + * (4) stat all files, checking expected size + * (5) sync testfile2/3 + * (6) stat all files again + * (7) read and check full contents of all files + */ + + size_t n_chks = filesize / chksize; + size_t extra = filesize % chksize; + if (extra) { + /* test only supports exact multiples of chunk size */ + filesize -= extra; + } + + char* databuf = malloc(filesize); + char* readbuf = malloc(filesize); + if ((NULL != databuf) && (NULL != readbuf)) { + testutil_lipsum_generate(databuf, filesize, 0); + + /* (1) write and sync testfile1 (no hole) */ + unifyfs_io_request t1_writes[n_chks + 1]; + for (size_t i = 0; i < n_chks; i++) { + t1_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t1_writes[i].gfid = t1_gfid; + t1_writes[i].nbytes = chksize; + t1_writes[i].offset = (off_t)(i * chksize); + t1_writes[i].user_buf = databuf + (i * chksize); + } + t1_writes[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + t1_writes[n_chks].gfid = t1_gfid; + + rc = unifyfs_dispatch_io(*fshdl, n_chks + 1, t1_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks + 1, t1_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + /* (2) write, but don't sync, testfile2 (with hole in middle) */ + unifyfs_io_request t2_writes[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + if (i == (n_chks / 2)) { + /* instead of writing middle chunk, use a no-op to + * leave a hole in the middle of the file */ + t2_writes[i].op = UNIFYFS_IOREQ_NOP; + } else { + 
t2_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t2_writes[i].gfid = t2_gfid; + t2_writes[i].nbytes = chksize; + t2_writes[i].offset = (off_t)(i * chksize); + t2_writes[i].user_buf = databuf + (i * chksize); + } + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t2_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t2_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + /* (3) write, but don't sync, testfile3 (with hole at end) */ + unifyfs_io_request t3_writes[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + if (i == (n_chks - 1)) { + /* instead of writing last chunk, truncate to filesize to + * leave a hole at the end of the file */ + t3_writes[i].op = UNIFYFS_IOREQ_OP_TRUNC; + t3_writes[i].gfid = t3_gfid; + t3_writes[i].offset = (off_t)(filesize); + } else { + t3_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t3_writes[i].gfid = t3_gfid; + t3_writes[i].nbytes = chksize; + t3_writes[i].offset = (off_t)(i * chksize); + t3_writes[i].user_buf = databuf + (i * chksize); + } + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t3_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t3_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + /* (4) stat all files */ + unifyfs_status t1_status, t2_status, t3_status; + + rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); + /* expected size=filesize since writes have been synced */ + ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, testfile1, + t1_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_stat(*fshdl, t2_gfid, &t2_status); + /* expected size=0 since writes have not been synced */ + ok((rc == UNIFYFS_SUCCESS) && (t2_status.global_file_size == 0), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=0)," + " rc=%d (%s)", __FILE__, __LINE__, testfile2, + t2_status.global_file_size, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_stat(*fshdl, t3_gfid, &t3_status); + /* expected size=filesize since truncate is a sync point */ + ok((rc == UNIFYFS_SUCCESS) && (t3_status.global_file_size == filesize), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, testfile3, + t3_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + /* (5) sync testfile2/3 */ + rc = unifyfs_sync(*fshdl, t2_gfid); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_sync(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_sync(*fshdl, t3_gfid); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_sync(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + /* (6) stat files again */ + rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); + ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), + "%s:%d unifyfs_stat(%s) is successful: 
filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, testfile1, + t1_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_stat(*fshdl, t2_gfid, &t2_status); + ok((rc == UNIFYFS_SUCCESS) && (t2_status.global_file_size == filesize), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, testfile2, + t2_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_stat(*fshdl, t3_gfid, &t3_status); + ok((rc == UNIFYFS_SUCCESS) && (t3_status.global_file_size == filesize), + "%s:%d unifyfs_stat(%s) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, testfile3, + t3_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + /* (7) read and check full contents of all files */ + memset(readbuf, (int)'?', filesize); + unifyfs_io_request t1_reads[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + t1_reads[i].op = UNIFYFS_IOREQ_OP_READ; + t1_reads[i].gfid = t1_gfid; + t1_reads[i].nbytes = chksize; + t1_reads[i].offset = (off_t)(i * chksize); + t1_reads[i].user_buf = readbuf + (i * chksize); + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t1_reads); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t1_reads, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + for (size_t i = 0; i < n_chks; i++) { + size_t bytes = t1_reads[i].nbytes; + off_t off = t1_reads[i].offset; + + /* check read operation status */ + int err = t1_reads[i].result.error; + size_t cnt = t1_reads[i].result.count; + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zu," + " rc=%d (%s)", __FILE__, __LINE__, testfile1, (size_t)off, + bytes, cnt, err, unifyfs_rc_enum_description(err)); + + /* check valid data */ + uint64_t error_offset; + int check = testutil_lipsum_check(t1_reads[i].user_buf, + (uint64_t)bytes, + (uint64_t)off, &error_offset); + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile1, (size_t)off, bytes); + } + + memset(readbuf, (int)'?', filesize); + unifyfs_io_request t2_reads[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + t2_reads[i].op = UNIFYFS_IOREQ_OP_READ; + t2_reads[i].gfid = t2_gfid; + t2_reads[i].nbytes = chksize; + t2_reads[i].offset = (off_t)(i * chksize); + t2_reads[i].user_buf = readbuf + (i * chksize); + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t2_reads); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t2_reads, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + for (size_t i = 0; i < n_chks; i++) { + size_t bytes = t2_reads[i].nbytes; + off_t off = t2_reads[i].offset; + + /* check read operation status */ + int err = t2_reads[i].result.error; + size_t cnt = t2_reads[i].result.count; + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zu," + " rc=%d (%s)", __FILE__, __LINE__, testfile2, (size_t)off, + bytes, cnt, err, 
unifyfs_rc_enum_description(err)); + + /* check valid data */ + int check; + if (i == (n_chks / 2)) { + /* check middle chunk hole is zeroes */ + check = testutil_zero_check(t2_reads[i].user_buf, bytes); + } else { + uint64_t error_offset; + check = testutil_lipsum_check(t2_reads[i].user_buf, + (uint64_t)bytes, + (uint64_t)off, &error_offset); + } + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile2, (size_t)off, bytes); + } + + memset(readbuf, (int)'?', filesize); + unifyfs_io_request t3_reads[n_chks]; + for (size_t i = 0; i < n_chks; i++) { + t3_reads[i].op = UNIFYFS_IOREQ_OP_READ; + t3_reads[i].gfid = t3_gfid; + t3_reads[i].nbytes = chksize; + t3_reads[i].offset = (off_t)(i * chksize); + t3_reads[i].user_buf = readbuf + (i * chksize); + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t3_reads); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t3_reads, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + + for (size_t i = 0; i < n_chks; i++) { + size_t bytes = t3_reads[i].nbytes; + off_t off = t3_reads[i].offset; + + /* check read operation status */ + int err = t3_reads[i].result.error; + size_t cnt = t3_reads[i].result.count; + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zu," + " rc=%d (%s)", __FILE__, __LINE__, testfile3, (size_t)off, + bytes, cnt, err, unifyfs_rc_enum_description(err)); + + /* check valid data */ + int check; + if (i == (n_chks - 1)) { + /* check last chunk hole is zeroes */ + check = testutil_zero_check(t3_reads[i].user_buf, bytes); + } else { + uint64_t error_offset; + check = testutil_lipsum_check(t3_reads[i].user_buf, + (uint64_t)bytes, + (uint64_t)off, &error_offset); + } + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile3, (size_t)off, bytes); + } + } + + diag("Finished API write/read/truncate/sync/stat tests"); + + //------------- + + diag("Removing test files"); + + if (t1_gfid != UNIFYFS_INVALID_GFID) { + rc = unifyfs_remove(*fshdl, testfile1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + } + + if (t2_gfid != UNIFYFS_INVALID_GFID) { + rc = unifyfs_remove(*fshdl, testfile2); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + } + + if (t3_gfid != UNIFYFS_INVALID_GFID) { + rc = unifyfs_remove(*fshdl, testfile3); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); + } + + //------------- + + return 0; + +} diff --git a/t/lib/testutil.c b/t/lib/testutil.c index d1e884ac5..0d4e3b4c6 100644 --- a/t/lib/testutil.c +++ b/t/lib/testutil.c @@ -13,12 +13,15 @@ */ #include +#include #include #include #include #include #include +#include "testutil.h" + static unsigned long seed; /* @@ -118,3 +121,92 @@ void testutil_get_size(char* path, size_t* global) *global = sb.st_size; } } + +/* + * Sequentially number every 8 bytes (uint64_t) + */ +void testutil_lipsum_generate(char* buf, uint64_t len, uint64_t offset) +{ + 
uint64_t i; + uint64_t skip = 0; + uint64_t remain = 0; + uint64_t start = offset / sizeof(uint64_t); + uint64_t count = len / sizeof(uint64_t); + uint64_t* ibuf = (uint64_t*) buf; + + /* check if we have any extra bytes at the front and end */ + if (offset % sizeof(uint64_t)) { + skip = sizeof(uint64_t) - (offset % sizeof(uint64_t)); + remain = (len - skip) % sizeof(uint64_t); + + ibuf = (uint64_t*) &buf[skip]; + start++; + + if (skip + remain >= sizeof(uint64_t)) { + count--; + } + } + + for (i = 0; i < count; i++) { + ibuf[i] = start + i; + } +} + +/* + * Check buffer contains lipsum generated data. + * returns 0 on success, -1 otherwise with @error_offset set. + */ +int testutil_lipsum_check(const char* buf, uint64_t len, uint64_t offset, + uint64_t* error_offset) +{ + uint64_t i, val; + uint64_t skip = 0; + uint64_t remain = 0; + uint64_t start = offset / sizeof(uint64_t); + uint64_t count = len / sizeof(uint64_t); + const uint64_t* ibuf = (uint64_t*) buf; + + /* check if we have any extra bytes at the front and end */ + if (offset % sizeof(uint64_t)) { + skip = sizeof(uint64_t) - (offset % sizeof(uint64_t)); + remain = (len - skip) % sizeof(uint64_t); + + ibuf = (uint64_t*) &buf[skip]; + start++; + + if (skip + remain >= sizeof(uint64_t)) { + count--; + } + } + + for (i = 0; i < count; i++) { + val = start + i; + if (ibuf[i] != val) { + *error_offset = offset + (i * sizeof(uint64_t)); + fprintf(stderr, + "LIPSUM CHECK ERROR: [%" PRIu64 "] @ offset %" PRIu64 + ", expected=%" PRIu64 " found=%" PRIu64 "\n", + i, *error_offset, val, ibuf[i]); + return -1; + } + } + + return 0; +} + +/* + * Check buffer contains all zero bytes. + * returns 0 on success, -1 otherwise. + */ +int testutil_zero_check(const char* buf, size_t len) +{ + for (size_t i = 0; i < len; i++) { + if (buf[i] != 0) { + fprintf(stderr, + "ZERO CHECK ERROR: byte @ offset %zu is non-zero\n", i); + return -1; + } + } + + return 0; +} diff --git a/t/lib/testutil.h b/t/lib/testutil.h index 58b628df6..b138cba45 100644 --- a/t/lib/testutil.h +++ b/t/lib/testutil.h @@ -12,6 +12,8 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ +#include + /* * Store a random string of length @len into buffer @buf. */ @@ -35,3 +37,20 @@ char* testutil_get_mount_point(void); /* Stat the file associated to by path and store the global size of the * file at path in the address of the global pointer passed in. */ void testutil_get_size(char* path, size_t* global); + +/* Sequentially number every 8 bytes (uint64_t) in given buffer */ +void testutil_lipsum_generate(char* buf, uint64_t len, uint64_t offset); + +/* + * Check buffer contains lipsum generated data. + * returns 0 on success, -1 otherwise with @error_offset set. + */ +int testutil_lipsum_check(const char* buf, uint64_t len, uint64_t offset, + uint64_t* error_offset); + + +/* + * Check buffer contains all zero bytes. + * returns 0 on success, -1 otherwise. + */ +int testutil_zero_check(const char* buf, size_t len); diff --git a/t/sharness.d/00-test-env.sh b/t/sharness.d/00-test-env.sh index e45079461..fa91c9a9d 100644 --- a/t/sharness.d/00-test-env.sh +++ b/t/sharness.d/00-test-env.sh @@ -64,3 +64,6 @@ if [ -f /proc/sys/kernel/yama/ptrace_scope ]; then fi fi fi + +# Want core dumps? 
uncomment following line +#ulimit -c unlimited diff --git a/t/sharness.d/01-unifyfs-settings.sh b/t/sharness.d/01-unifyfs-settings.sh index 090064880..735240ef3 100644 --- a/t/sharness.d/01-unifyfs-settings.sh +++ b/t/sharness.d/01-unifyfs-settings.sh @@ -19,7 +19,9 @@ export UNIFYFS_RUNSTATE_DIR=${UNIFYFS_TEST_STATE} export UNIFYFS_SHAREDFS_DIR=${UNIFYFS_TEST_SHARE} # Client settings -export UNIFYFS_LOGIO_SPILL_SIZE=$((5 * (2 ** 30))) +export UNIFYFS_LOGIO_CHUNK_SIZE=$((32 * 1024)) +export UNIFYFS_LOGIO_SHMEM_SIZE=$((64 * 1048576)) +export UNIFYFS_LOGIO_SPILL_SIZE=$((512 * 1048576)) export UNIFYFS_LOGIO_SPILL_DIR=${UNIFYFS_TEST_SPILL} # In mercury 2.0.0, daemonize breaks our use of na+sm. More info diff --git a/util/unifyfs-stage/src/unifyfs-stage-transfer.c b/util/unifyfs-stage/src/unifyfs-stage-transfer.c index f70e6d3dc..2273affad 100644 --- a/util/unifyfs-stage/src/unifyfs-stage-transfer.c +++ b/util/unifyfs-stage/src/unifyfs-stage-transfer.c @@ -315,6 +315,8 @@ int unifyfs_stage_transfer(unifyfs_stage_t* ctx) ret = unifyfs_transfer_file_serial(src, dst); if (ret) { + fprintf(stderr, "[%d] failed to transfer file (err=%d)\n", + rank, -ret); goto out; } @@ -335,35 +337,22 @@ int unifyfs_stage_transfer(unifyfs_stage_t* ctx) } } } else { - if (0 == rank) { - int fd = -1; - - if (verbose) { - fprintf(stdout, "[%d] parallel transfer: src=%s, dst=%s\n", - rank, src, dst); - } - - fd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0600); - if (fd < 0) { - fprintf(stderr, "[%d] failed to create the file %s\n", - rank, dst); - goto out; - } - close(fd); + if (verbose) { + fprintf(stdout, "[%d] parallel transfer: src=%s, dst=%s\n", + rank, src, dst); } MPI_Barrier(MPI_COMM_WORLD); ret = unifyfs_transfer_file_parallel(src, dst); if (ret) { + fprintf(stderr, "[%d] failed to transfer file (err=%d)\n", + rank, -ret); goto out; } MPI_Barrier(MPI_COMM_WORLD); - // possible lamination check or force lamination - // may need to go here - if (ctx->checksum && 0 == rank) { ret = verify_checksum(src, dst); if (ret) { From 0fa790e06f6b829b1c37389882c8e7efa2785bb0 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Fri, 14 May 2021 15:30:09 -0400 Subject: [PATCH 05/81] Include sys/sysmacros.h in testlib.h In recent Linux systems, major() and minor() are defined in sys/sysmacros.h and sys/types.h no longer includes sys/sysmacros.h. So, we need to include sys/sysmacros.h in testlib.h. Also adds a check for the existence of sys/sysmacros.h in configure.ac. Include sys/sysmacros.h in testlib.h --- configure.ac | 1 + examples/src/testlib.h | 1 + 2 files changed, 2 insertions(+) diff --git a/configure.ac b/configure.ac index 681cdb707..77719eb87 100755 --- a/configure.ac +++ b/configure.ac @@ -65,6 +65,7 @@ AC_CHECK_HEADERS([fcntl.h inttypes.h libgen.h limits.h mntent.h strings.h syslog AC_CHECK_HEADERS([wchar.h wctype.h]) AC_CHECK_HEADERS([sys/mount.h sys/socket.h sys/statfs.h sys/time.h]) AC_CHECK_HEADERS([arpa/inet.h netdb.h netinet/in.h]) +AC_CHECK_HEADER([sys/sysmacros.h], [], AC_MSG_ERROR([cannot find required header sys/sysmacros.h])) # Checks for library functions. AC_FUNC_MALLOC diff --git a/examples/src/testlib.h b/examples/src/testlib.h index 2b0a698da..41c9900a6 100644 --- a/examples/src/testlib.h +++ b/examples/src/testlib.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include From 07e1d312c40e3c1579f3da60925b2289de7e9290 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Mon, 24 May 2021 12:09:40 -0400 Subject: [PATCH 06/81] clarify that library api is not required for apps --- docs/library_api.rst | 55 ++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/docs/library_api.rst b/docs/library_api.rst index 6a0177929..8da1d80b6 100644 --- a/docs/library_api.rst +++ b/docs/library_api.rst @@ -1,23 +1,28 @@ -========================== -UnifyFS Client API Library -========================== +============================== +UnifyFS API for I/O Middleware +============================== -This section describes the purpose, concepts, and usage of the UnifyFS client -API library. +This section describes the purpose, concepts, and usage of the UnifyFS +library API. ------------------- -Client API Purpose ------------------- +------------------- +Library API Purpose +------------------- -The client API library provides a direct interface for UnifyFS configuration, +The UnifyFS library API provides a direct interface for UnifyFS configuration, namespace management, and batched file I/O and transfer operations. The library is primarily targeted for use by I/O middleware software such as HDF5 and VeloC, but is also useful for user applications needing programmatic control and interactions with UnifyFS. -------------------- -Client API Concepts -------------------- +.. note:: + Use of the library API is *not* required for most applications, as UnifyFS + will transparently intercept I/O operations made by the application. See + :doc:`examples` for examples of typical application usage. + +-------------------- +Library API Concepts +-------------------- Namespace (aka Mountpoint) ************************** @@ -27,13 +32,13 @@ used to distinguish the UnifyFS namespace from other file systems available to the client application. All absolute file paths that include the mountpoint prefix are treated as belonging to the associated UnifyFS namespace. -Using the client API, an application or I/O middleware system can operate on +Using the library API, an application or I/O middleware system can operate on multiple UnifyFS namespaces concurrently. File System Handle ****************** -All client API library methods require a file system handle parameter of type +All library API methods require a file system handle parameter of type ``unifyfs_handle``. Users obtain a valid handle via an API call to ``unifyfs_initialize()``, which specifies the mountpoint prefix and configuration settings associated with the handle. @@ -55,9 +60,9 @@ such, it is valid to obtain the gfid for a file in a single process (e.g., via file creation), and then share the resulting gfid value among other parallel processes via a collective communication mechanism. ----------------- -Client API Types ----------------- +----------------- +Library API Types +----------------- The file system handle type is a pointer to an opaque client structure that records the associated mountpoint and configuration. @@ -153,11 +158,11 @@ status and transfer operation result. UNIFYFS_TRANSFER_MODE_MOVE // copy, then remove source } unifyfs_transfer_mode; ------------------------- -Example Client API Usage ------------------------- +------------------------- +Example Library API Usage +------------------------- -To get started using the client API, please add the following to your client +To get started using the library API, please add the following to your client source code files that will make calls to API methods. 
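For illustration, a minimal end-to-end sketch of a client using the library API
might look like the following (the exact ``unifyfs_initialize()`` and
``unifyfs_finalize()`` signatures are assumed here for the sketch; consult
``unifyfs_api.h`` for the authoritative declarations):

.. code-block:: C

    #include "unifyfs_api.h"   /* adjust the include path to match your install */

    int use_unifyfs(void)
    {
        unifyfs_handle fshdl;

        /* mountpoint prefix, no additional configuration options */
        int rc = unifyfs_initialize("/unifyfs", NULL, 0, &fshdl);
        if (rc != UNIFYFS_SUCCESS) {
            return rc;
        }

        /* ... namespace, batched I/O, and transfer operations on fshdl ... */

        return unifyfs_finalize(fshdl);
    }

Return codes from API calls can be turned into human-readable strings with
``unifyfs_rc_enum_description()``, as the api unit tests do.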
You will also need to modify your client application build process to link with the ``libunifyfs_api`` library. @@ -167,7 +172,7 @@ modify your client application build process to link with the #include -The common pattern for using the client API is to initialize a UnifyFS file +The common pattern for using the library API is to initialize a UnifyFS file system handle, perform a number of operations using that handle, and then release the handle. As previously mentioned, the same client process may initialize multiple file system handles and use them concurrently, either @@ -247,7 +252,7 @@ When no longer required, files can be deleted using ``unifyfs_remove()``. Batched File I/O **************** -File I/O operations in the client API use a batched request interface similar +File I/O operations in the library API use a batched request interface similar to POSIX ``lio_listio()``. A client application dispatches an array of I/O operation requests, where each request identifies the target file gfid, the operation type (e.g., read, write, or truncate), and associated operation @@ -302,7 +307,7 @@ immediately waits for completion of the entire batch. Batched File Transfers ********************** -File transfer operations in the client API also use a batched request +File transfer operations in the library API also use a batched request interface. A client application dispatches an array of file transfer requests, where each request identifies the source and destination file paths and the transfer mode. Two transfer modes are currently supported: @@ -352,7 +357,7 @@ the entire batch. More Examples ************* -Additional examples demonstrating use of the client API can be found in +Additional examples demonstrating use of the library API can be found in the unit tests (see api-unit-tests_). .. explicit external hyperlink targets From 190ef6534a1ff97e77f86a7fcfdcf8d703f433d2 Mon Sep 17 00:00:00 2001 From: CamStan Date: Fri, 21 May 2021 13:29:05 -0700 Subject: [PATCH 07/81] Bugfix and enable unifyfs-stage use with MPI mount Fixes a bug caused by a typo when attempting to transfer files out during unifyfs terminate. Add flag when building unifyfs-stage to allows its use when using the automount feature. 
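The automount feature relies on the client's PMPI wrappers (pmpi_wrappers.c),
which mount UnifyFS during MPI_Init(); when unifyfs-stage is built with
-DENABLE_MPI_MOUNT it therefore skips its own unifyfs_mount()/unifyfs_unmount()
calls to avoid a duplicate mount. A rough sketch of the wrapper idea, for
illustration only (the hard-coded mountpoint and header name are assumptions,
not the exact UnifyFS implementation):

    #include <mpi.h>
    #include <unifyfs.h>   /* declares unifyfs_mount(); adjust to your install */

    int MPI_Init(int* argc, char*** argv)
    {
        int rank, nprocs;
        int rc = PMPI_Init(argc, argv);              /* real MPI initialization */
        if (rc == MPI_SUCCESS) {
            PMPI_Comm_rank(MPI_COMM_WORLD, &rank);
            PMPI_Comm_size(MPI_COMM_WORLD, &nprocs);
            unifyfs_mount("/unifyfs", rank, nprocs, 0); /* automount the namespace */
        }
        return rc;
    }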
--- util/unifyfs-stage/src/Makefile.am | 14 ++++++++++---- util/unifyfs-stage/src/unifyfs-stage.c | 8 ++++++-- util/unifyfs-stage/src/unifyfs-stage.h | 3 +++ util/unifyfs/src/unifyfs-rm.c | 2 +- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/util/unifyfs-stage/src/Makefile.am b/util/unifyfs-stage/src/Makefile.am index 00e671a96..a62660884 100644 --- a/util/unifyfs-stage/src/Makefile.am +++ b/util/unifyfs-stage/src/Makefile.am @@ -5,10 +5,16 @@ unifyfs_stage_SOURCES = unifyfs-stage.c \ noinst_HEADERS = unifyfs-stage.h -unifyfs_stage_CPPFLAGS = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ - $(OPENSSL_CFLAGS) \ - -I$(top_srcdir)/client/src \ - -I$(top_srcdir)/common/src +stage_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ + $(OPENSSL_CFLAGS) \ + -I$(top_srcdir)/client/src \ + -I$(top_srcdir)/common/src + +if USE_PMPI_WRAPPERS +stage_cppflags += -DENABLE_MPI_MOUNT +endif + +unifyfs_stage_CPPFLAGS = $(stage_cppflags) unifyfs_stage_LDADD = $(top_builddir)/client/src/libunifyfs.la -lrt -lm diff --git a/util/unifyfs-stage/src/unifyfs-stage.c b/util/unifyfs-stage/src/unifyfs-stage.c index 169c6a325..9eb5dd6dc 100644 --- a/util/unifyfs-stage/src/unifyfs-stage.c +++ b/util/unifyfs-stage/src/unifyfs-stage.c @@ -270,6 +270,10 @@ int main(int argc, char** argv) ctx->mountpoint = mountpoint; ctx->manifest_file = manifest_file; +#if defined(ENABLE_MPI_MOUNT) + ctx->enable_mpi_mount = 1; +#endif + if (verbose) { unifyfs_stage_print(ctx); } @@ -278,7 +282,7 @@ int main(int argc, char** argv) debug_pause(rank, "About to mount unifyfs.. "); } - if (should_we_mount_unifyfs) { + if (should_we_mount_unifyfs && !ctx->enable_mpi_mount) { ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); if (ret) { fprintf(stderr, "failed to mount unifyfs at %s (%s)", @@ -305,7 +309,7 @@ int main(int argc, char** argv) } } - if (should_we_mount_unifyfs) { + if (should_we_mount_unifyfs && !ctx->enable_mpi_mount) { ret = unifyfs_unmount(); if (ret) { fprintf(stderr, "unmounting unifyfs failed (ret=%d)\n", ret); diff --git a/util/unifyfs-stage/src/unifyfs-stage.h b/util/unifyfs-stage/src/unifyfs-stage.h index d9a12697c..229ad9648 100644 --- a/util/unifyfs-stage/src/unifyfs-stage.h +++ b/util/unifyfs-stage/src/unifyfs-stage.h @@ -23,6 +23,7 @@ struct _unifyfs_stage { int checksum; /* perform checksum? 0:no, 1:yes */ int mode; /* transfer mode? 0:serial, 1:parallel */ int should_we_mount_unifyfs; /* mount? 
0:no (for testing), 1: yes */ + int enable_mpi_mount; /* automount during MPI_Init() */ char* mountpoint; /* unifyfs mountpoint */ char* manifest_file; /* manifest file containing the transfer list */ }; @@ -37,6 +38,7 @@ static inline void unifyfs_stage_print(unifyfs_stage_t* ctx) "checksum = %d\n" "mode = %d\n" "should_we_mount_unifyfs = %d\n" + "mpi_mount = %d\n" "mountpoint = %s\n" "manifest file = %s\n", ctx->rank, @@ -44,6 +46,7 @@ static inline void unifyfs_stage_print(unifyfs_stage_t* ctx) ctx->checksum, ctx->mode, ctx->should_we_mount_unifyfs, + ctx->enable_mpi_mount, ctx->mountpoint, ctx->manifest_file); } diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index 327840be0..6301c8890 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ b/util/unifyfs/src/unifyfs-rm.c @@ -899,7 +899,7 @@ static int jsrun_stage(unifyfs_resource_t* resource, // full command: jsrun snprintf(cmd, sizeof(cmd), - "jsrun --immediate -e -individual --stdio_stderr unifyfs-stage.err.%%h.%%p --stdio_stdout unifyfs-stage.out.%%h.%%p --nrs %zu -r1 -c1 -a1", + "jsrun --immediate -e individual --stdio_stderr unifyfs-stage.err.%%h.%%p --stdio_stdout unifyfs-stage.out.%%h.%%p --nrs %zu -r1 -c1 -a1", resource->n_nodes); generic_stage(cmd, jsrun_argc, args); From 9807fdaae268152280cba7f9c94591bb2378730a Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Fri, 4 Jun 2021 15:56:53 -0400 Subject: [PATCH 08/81] allow variable logio header size --- common/src/unifyfs_logio.c | 105 +++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 27 deletions(-) diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index 0c33e7369..afcd6f9b7 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -12,13 +12,15 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ +#define _GNU_SOURCE /* for Linux mremap() */ +#include + #include #include #include #include #include #include -#include #include #include "unifyfs_log.h" @@ -33,11 +35,12 @@ /* log-based I/O header - first page of shmem region or spill file */ typedef struct log_header { + size_t hdr_sz; /* total header bytes (struct and chunk_map) */ size_t data_sz; /* total data bytes in log */ size_t reserved_sz; /* reserved data bytes */ size_t chunk_sz; /* data chunk size */ size_t max_reserved_slot; /* slot index for last reserved chunk */ - off_t data_offset; /* file/memory offset where data chunks start */ + off_t data_offset; /* file/memory offset where data chunks start */ } log_header; /* chunk slot_map immediately follows header and occupies rest of the page */ // slot_map chunk_map; /* chunk slot_map that tracks reservations */ @@ -133,17 +136,38 @@ static int get_spillfile(const char* path, } /* map log header (1st page) of spill file given by file descriptor */ -static void* map_spillfile(int spill_fd, int mmap_prot) +static void* map_spillfile(int spill_fd, int mmap_prot, int n_pages) { + int err; size_t pgsz = get_page_size(); + size_t mapsz = pgsz * n_pages; + LOGDBG("mapping spillfile - fd=%d, pgsz=%zu", spill_fd, pgsz); - void* addr = mmap(NULL, pgsz, mmap_prot, MAP_SHARED, spill_fd, 0); + errno = 0; + void* addr = mmap(NULL, mapsz, mmap_prot, MAP_SHARED, spill_fd, 0); + err = errno; if (MAP_FAILED == addr) { - int err = errno; LOGERR("mmap(fd=%d, sz=%zu, MAP_SHARED) failed - %s", - spill_fd, pgsz, strerror(err)); + spill_fd, mapsz, strerror(err)); return NULL; } + + if (mmap_prot == PROT_READ) { /* server maps for read only */ + log_header* loghdr = (log_header*) addr; + size_t hdr_sz = loghdr->hdr_sz; + if (hdr_sz > mapsz) { + /* need to remap to access the entire header */ + errno = 0; + void* new_addr = mremap(addr, mapsz, hdr_sz, MREMAP_MAYMOVE); + err = errno; + if (MAP_FAILED == new_addr) { + LOGERR("mremap(old_sz=%zu, new_sz=%zu, MAYMOVE) failed - %s", + mapsz, hdr_sz, strerror(err)); + return NULL; + } + return new_addr; + } + } return addr; } @@ -182,7 +206,7 @@ int unifyfs_logio_init_server(const int app_id, return EINVAL; } - /* open the spill over file */ + /* open the spill-over file */ snprintf(spillfile, sizeof(spillfile), LOGIO_SPILL_FMTSTR, spill_dir, app_id, client_id); spill_fd = get_spillfile(spillfile, spill_size); @@ -190,9 +214,9 @@ int unifyfs_logio_init_server(const int app_id, LOGERR("Failed to open logio spill file!"); return UNIFYFS_FAILURE; } else { - /* map first page of the spill over file, which contains log header + /* map the start of the spill-over file, which contains log header * and chunk slot_map. server only needs read access. 
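         * A single page is always sufficient here: hdr_sz is the first field of
         * the log header, so map_spillfile() can read it from the initial page
         * and then mremap() the mapping to cover the full header (struct plus
         * chunk slot_map) whenever it spans more than one page.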
*/ - spill_mapping = map_spillfile(spill_fd, PROT_READ); + spill_mapping = map_spillfile(spill_fd, PROT_READ, 1); if (NULL == spill_mapping) { LOGERR("Failed to map logio spill file header!"); return UNIFYFS_FAILURE; @@ -236,24 +260,43 @@ static int init_log_header(char* log_region, /* zero all log header fields */ memset(log_region, 0, sizeof(log_header)); - - /* chunk data starts after header page */ - size_t data_size = region_size - pgsz; - hdr->data_sz = data_size; hdr->chunk_sz = chunk_size; - hdr->data_offset = (off_t)pgsz; - /* initialize chunk slot map (immediately follows header in memory) */ + /* chunk slot map immediately follows header */ char* slotmap = log_region + sizeof(log_header); - size_t slotmap_size = pgsz - sizeof(log_header); - size_t n_chunks = data_size / chunk_size; - slot_map* chunkmap = slotmap_init(n_chunks, (void*)slotmap, slotmap_size); - if (NULL == chunkmap) { - LOGERR("Failed to initialize chunk slotmap @ %p (sz=%zu, #chunks=%zu)", - slotmap, slotmap_size, n_chunks); - return UNIFYFS_FAILURE; + + /* determine number of pages necessary to hold chunkmap */ + size_t hdr_pages = 1; + size_t hdr_size = 0; + size_t data_size = 0; + while (1) { + hdr_size = (hdr_pages * pgsz); + if (hdr_size >= region_size) { + LOGERR("Failed chunk slotmap init (region_sz=%zu, chunk_sz=%zu)", + region_size, chunk_size); + return UNIFYFS_FAILURE; + } + + /* chunk data starts after header pages */ + data_size = region_size - hdr_size; + size_t n_chunks = data_size / chunk_size; + + /* try to init chunk slotmap */ + size_t slotmap_size = hdr_size - sizeof(log_header); + slot_map* chunkmap = slotmap_init(n_chunks, slotmap, slotmap_size); + if (NULL == chunkmap) { + LOGDBG("chunk slotmap init failed (sz=%zu, #chunks=%zu)", + slotmap_size, n_chunks); + hdr_pages++; + continue; + } + break; } + hdr->hdr_sz = hdr_size; + hdr->data_sz = data_size; + hdr->data_offset = (off_t)hdr_size; + return UNIFYFS_SUCCESS; } @@ -333,7 +376,7 @@ int unifyfs_logio_init_client(const int app_id, void* spill_mapping = NULL; int spill_fd = -1; if (unifyfs_use_spillover) { - /* get directory in which to create spill over files */ + /* get directory in which to create spill-over files */ cfgval = client_cfg->logio_spill_dir; if (NULL == cfgval) { LOGERR("UNIFYFS_LOGIO_SPILL_DIR configuration not set! " @@ -341,20 +384,28 @@ int unifyfs_logio_init_client(const int app_id, return UNIFYFS_ERROR_BADCONFIG; } - /* define path to the spill over file for data chunks */ + /* define path to the spill-over file for data chunks */ char spillfile[UNIFYFS_MAX_FILENAME]; snprintf(spillfile, sizeof(spillfile), LOGIO_SPILL_FMTSTR, cfgval, app_id, client_id); - /* create the spill over file */ + /* create the spill-over file */ spill_fd = get_spillfile(spillfile, spill_size); if (spill_fd < 0) { LOGERR("Failed to open logio spill file!"); return UNIFYFS_FAILURE; } else { - /* map first page of the spill over file, which contains log header + /* estimate header size based on number of chunks */ + size_t pgsz = get_page_size(); + size_t n_chunks = spill_size / chunk_size; + size_t chunks_per_page = pgsz * 8; /* 8 chunks per map byte */ + size_t n_pages = n_chunks / chunks_per_page; + n_pages++; /* +1 to account for logio metadata */ + + /* map start of the spill-over file, which contains log header * and chunk slot_map. client needs read and write access. 
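         * As a worked example (assuming a 4 KiB page size): with the 32 KiB
         * chunk size and 512 MiB spill size used in
         * t/sharness.d/01-unifyfs-settings.sh, n_chunks = 16384 and
         * chunks_per_page = 32768, so n_pages works out to 0 + 1 = 1 page for
         * the client's initial header mapping.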
*/ - spill_mapping = map_spillfile(spill_fd, PROT_READ|PROT_WRITE); + int map_flags = PROT_READ | PROT_WRITE; + spill_mapping = map_spillfile(spill_fd, map_flags, n_pages); if (NULL == spill_mapping) { LOGERR("Failed to map logio spill file header!"); return UNIFYFS_FAILURE; From 6f2192d4e3ddfdafb3024cfd8698b4f4b07ae204 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 13 Jun 2021 07:30:02 -0700 Subject: [PATCH 09/81] Updates to overview section of docs for clarity --- docs/overview.rst | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/docs/overview.rst b/docs/overview.rst index 3aedb6e13..8f1c549e1 100644 --- a/docs/overview.rst +++ b/docs/overview.rst @@ -2,25 +2,29 @@ Overview ================ -UnifyFS is a user level file system currently under active development. An -application can use node-local storage as burst buffers for shared files. -UnifyFS is designed to support both checkpoint/restart which is the most -important I/O workload for HPC and other common I/O workloads. With -UnifyFS, applications can write to fast, scalable, node-local burst buffers as -easily as they do to the parallel file system. This section provides a high -level design of UnifyFS. It describes the UnifyFS library and the UnifyFS -daemon. - -The file system that UnifyFS instantiates only exists in user space and is -only visible to applications linked against the UnifyFS client library. Since -traditional file system tools (ls, cd, etc.) are not linked against the -UnifyFS client library they cannot see nor manipulate files within UnifyFS. -Each UnifyFS file system lasts as -long as the server processes are running, which is typically as long as the -job they are running within. When the servers exit the file system is -deleted. It is the responsibility of the user to copy files that -need to be persisted from UnifyFS to a permanent file system. -UnifyFS provides an API and a utility to conduct such copies. +UnifyFS is a user-level file system under active development +that supports shared file I/O over distributed storage on HPC systems, +e.g., node-local burst buffers. +With UnifyFS, applications can write to fast, scalable, node-local burst buffers as +easily as they do to the parallel file system. +UnifyFS is designed to support common I/O workloads such as +checkpoint/restart and other bulk-synchronous I/O workloads typically +performed by HPC applications. + +Because the UnifyFS file system is implemented at user-level, the +file system is visible only to applications linked with the UnifyFS client library. +A consequence of this is that +traditional file system tools (ls, cd, etc.) installed by system administrators +cannot act on files in a UnifyFS file system because they are not linked +against the UnifyFS client library. +The lifetime of a UnifyFS file system is the duration of the execution of +the UnifyFS server processes, which is typically for the duration of an +HPC job. +When the servers exit, the UnifyFS file system terminates. +Users must copy files that need to be persisted beyond the lifetime of the +job from UnifyFS to a permanent file system. +UnifyFS provides an API and a utility to perform these copies. + --------------------------- High Level Design @@ -28,6 +32,8 @@ High Level Design .. image:: images/design-high-lvl.png +This section provides a high +level design of UnifyFS. UnifyFS presents a shared namespace (e.g., /unifyfs as a mount point) to all compute nodes in a job allocation. 
There are two main components of UnifyFS: the UnifyFS library and the UnifyFS server. From d241c87ef3b1a14a2fc21e632900f1daececd389 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Fri, 11 Jun 2021 10:44:44 -0400 Subject: [PATCH 10/81] build separate libraries for mpi-mount Also: - add rpath support to gotcha and spath m4 - define USE_SPATH for use in source code (HAVE_SPATH is for makefiles) - remove old sysio examples - automake target_LDFLAGS should only be used for linker options, target_LIBADD is suggested for '-L -l' additions --- client/src/Makefile.am | 58 +++- client/src/unifyfs.c | 8 +- configure.ac | 2 +- examples/src/Makefile.am | 537 +++++++++++------------------ examples/src/sysio-cp.c | 294 ---------------- examples/src/sysio-dir.c | 350 ------------------- examples/src/sysio-open.c | 243 ------------- examples/src/sysio-read.c | 434 ----------------------- examples/src/sysio-stat.c | 183 ---------- examples/src/sysio-truncate.c | 187 ---------- examples/src/sysio-unlink.c | 160 --------- examples/src/sysio-write.c | 397 --------------------- examples/src/sysio-writeread.c | 310 ----------------- examples/src/sysio-writeread2.c | 414 ---------------------- m4/gotcha.m4 | 26 +- m4/spath.m4 | 42 ++- t/Makefile.am | 195 ++++------- t/sys/chdir.c | 12 +- util/unifyfs-stage/src/Makefile.am | 37 +- 19 files changed, 413 insertions(+), 3476 deletions(-) delete mode 100644 examples/src/sysio-cp.c delete mode 100644 examples/src/sysio-dir.c delete mode 100644 examples/src/sysio-open.c delete mode 100644 examples/src/sysio-read.c delete mode 100644 examples/src/sysio-stat.c delete mode 100644 examples/src/sysio-truncate.c delete mode 100644 examples/src/sysio-unlink.c delete mode 100644 examples/src/sysio-write.c delete mode 100644 examples/src/sysio-writeread.c delete mode 100644 examples/src/sysio-writeread2.c diff --git a/client/src/Makefile.am b/client/src/Makefile.am index 2ff8951b1..a40b2b180 100644 --- a/client/src/Makefile.am +++ b/client/src/Makefile.am @@ -2,15 +2,21 @@ include $(top_srcdir)/common/src/Makefile.mk lib_LTLIBRARIES = libunifyfs.la libunifyfs_api.la -libunifyfsdir = $(includedir) +if USE_PMPI_WRAPPERS +lib_LTLIBRARIES += libunifyfs_mpi.la +endif #USE_PMPI_WRAPPERS if HAVE_GOTCHA lib_LTLIBRARIES += libunifyfs_gotcha.la -libunifyfs_gotchadir = $(includedir) + +if USE_PMPI_WRAPPERS +lib_LTLIBRARIES += libunifyfs_mpi_gotcha.la +endif #USE_PMPI_WRAPPERS if HAVE_FORTRAN lib_LTLIBRARIES += libunifyfsf.la endif #HAVE_FORTRAN + endif #HAVE_GOTCHA AM_CFLAGS = -Wall -Wno-strict-aliasing -Werror @@ -24,6 +30,9 @@ include_HEADERS += unifyfsf.h endif #HAVE_FORTRAN endif #HAVE_GOTCHA + +# Common compile/link flag definitions + CLIENT_COMMON_CPPFLAGS = \ -I$(top_builddir)/client \ -I$(top_srcdir)/client/include \ @@ -35,13 +44,18 @@ CLIENT_COMMON_CFLAGS = \ $(MPI_CFLAGS) CLIENT_COMMON_LDFLAGS = \ - -version-info $(LIBUNIFYFS_LT_VERSION) \ - $(MPI_CLDFLAGS) + -version-info $(LIBUNIFYFS_LT_VERSION) CLIENT_COMMON_LIBADD = \ $(UNIFYFS_COMMON_LIBS) \ + $(MPI_CLDFLAGS) \ -lm -lrt -lcrypto -lpthread +if HAVE_SPATH +CLIENT_COMMON_CFLAGS += $(SPATH_CFLAGS) +CLIENT_COMMON_LIBADD += $(SPATH_LIBS) +endif + CLIENT_API_SRC_FILES = \ unifyfs_api.h \ unifyfs_api_internal.h \ @@ -67,10 +81,6 @@ CLIENT_CORE_SRC_FILES = \ uthash.h \ utlist.h -if USE_PMPI_WRAPPERS -CLIENT_CORE_SRC_FILES += pmpi_wrappers.c pmpi_wrappers.h -endif - CLIENT_POSIX_SRC_FILES = \ unifyfs-dirops.c \ unifyfs-dirops.h \ @@ -79,6 +89,13 @@ CLIENT_POSIX_SRC_FILES = \ unifyfs-sysio.c \ unifyfs-sysio.h +PMPI_SRC_FILES = 
\ + pmpi_wrappers.c \ + pmpi_wrappers.h + + +# Per-target flags begin here + libunifyfs_api_la_SOURCES = \ $(CLIENT_API_SRC_FILES) \ $(CLIENT_CORE_SRC_FILES) \ @@ -96,6 +113,15 @@ libunifyfs_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_la_LIBADD = $(CLIENT_COMMON_LIBADD) +if USE_PMPI_WRAPPERS +libunifyfs_mpi_la_SOURCES = $(PMPI_SRC_FILES) +libunifyfs_mpi_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) +libunifyfs_mpi_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) +libunifyfs_mpi_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) +libunifyfs_mpi_la_LIBADD = libunifyfs.la +endif #USE_PMPI_WRAPPERS + + if HAVE_GOTCHA libunifyfs_gotcha_la_SOURCES = \ @@ -104,16 +130,22 @@ libunifyfs_gotcha_la_SOURCES = \ gotcha_map_unifyfs_list.c libunifyfs_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) -DUNIFYFS_GOTCHA libunifyfs_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) $(GOTCHA_CFLAGS) -libunifyfs_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) $(GOTCHA_LDFLAGS) -libunifyfs_gotcha_la_LIBADD = $(CLIENT_COMMON_LIBADD) -lgotcha +libunifyfs_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) +libunifyfs_gotcha_la_LIBADD = $(CLIENT_COMMON_LIBADD) $(GOTCHA_LIBS) -endif +if USE_PMPI_WRAPPERS +libunifyfs_mpi_gotcha_la_SOURCES = $(PMPI_SRC_FILES) +libunifyfs_mpi_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) +libunifyfs_mpi_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) +libunifyfs_mpi_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) +libunifyfs_mpi_gotcha_la_LIBADD = libunifyfs_gotcha.la +endif #USE_PMPI_WRAPPERS if HAVE_FORTRAN - libunifyfsf_la_SOURCES = unifyfsf.c libunifyfsf_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfsf_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfsf_la_LIBADD = libunifyfs_gotcha.la +endif #HAVE_FORTRAN -endif +endif #HAVE_GOTCHA diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 87edfcbd3..8b2a93bb9 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -50,9 +50,9 @@ #include "unifyfs_rpc_util.h" #include "margo_client.h" -#ifdef HAVE_SPATH +#ifdef USE_SPATH #include "spath.h" -#endif /* HAVE_SPATH */ +#endif /* USE_SPATH */ /* avoid duplicate mounts (for now) */ int unifyfs_mounted = -1; @@ -279,13 +279,13 @@ static void unifyfs_normalize_path(const char* path, char* normalized) snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", path); } -#ifdef HAVE_SPATH +#ifdef USE_SPATH /* normalize path to handle '.', '..', * and extra or trailing '/' characters */ char* str = spath_strdup_reduce_str(normalized); snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", str); free(str); -#endif /* HAVE_SPATH */ +#endif /* USE_SPATH */ } /* Given a path, which may relative or absoluate, diff --git a/configure.ac b/configure.ac index 77719eb87..54b3daca4 100755 --- a/configure.ac +++ b/configure.ac @@ -163,7 +163,7 @@ AS_IF([test "$have_C_mpi" != "yes"], [] ) -# look for gotcha library, sets GOTCHA_INCLUDE, GOTCHA_LIB +# look for gotcha library, sets GOTCHA_CFLAGS, GOTCHA_LDFLAGS, GOTCHA_LIBS UNIFYFS_AC_GOTCHA # error out if fortran was enabled but GOTCHA wasn't found diff --git a/examples/src/Makefile.am b/examples/src/Makefile.am index d503e3cf9..215f9dee1 100644 --- a/examples/src/Makefile.am +++ b/examples/src/Makefile.am @@ -1,384 +1,263 @@ +testutil_headers = \ + testutil.h \ + testutil_rdwr.h + +noinst_HEADERS = \ + $(testutil_headers) \ + testlib.h + libexec_PROGRAMS = \ cr-posix \ read-posix \ write-posix \ - writeread-posix \ - sysio-writeread-posix + writeread-posix if HAVE_LD_WRAP - libexec_PROGRAMS += \ - cr-static \ - read-static \ - write-static \ - writeread-static \ - 
sysio-write-static \ - sysio-read-static \ - sysio-writeread-static \ - sysio-writeread2-static \ - sysio-dir-static \ - sysio-stat-static \ - sysio-cp-static \ - sysio-truncate-static \ - sysio-unlink-static \ - sysio-open-static \ - app-mpiio-static \ +libexec_PROGRAMS += \ app-btio-static \ + app-mpiio-static \ app-tileio-static \ - transfer-static \ - size-static \ - simul-static \ chmod-static \ + cr-static \ multi-write-static \ - read-data-static -endif + read-static \ + read-data-static \ + simul-static \ + size-static \ + transfer-static \ + write-static \ + writeread-static +endif #HAVE_LD_WRAP if HAVE_GOTCHA - libexec_PROGRAMS += \ - cr-gotcha \ - read-gotcha \ - write-gotcha \ - writeread-gotcha \ - sysio-write-gotcha \ - sysio-read-gotcha \ - sysio-writeread-gotcha \ - sysio-writeread2-gotcha \ - sysio-dir-gotcha \ - sysio-stat-gotcha \ - sysio-cp-gotcha \ - sysio-truncate-gotcha \ - sysio-unlink-gotcha \ - sysio-open-gotcha \ - app-mpiio-gotcha \ - app-btio-gotcha \ - app-tileio-gotcha \ - transfer-gotcha \ - size-gotcha \ - simul-gotcha \ - chmod-gotcha \ - multi-write-gotcha \ - read-data-gotcha + +libexec_PROGRAMS += \ + app-btio-gotcha \ + app-mpiio-gotcha \ + app-tileio-gotcha \ + chmod-gotcha \ + cr-gotcha \ + multi-write-gotcha \ + read-gotcha \ + read-data-gotcha \ + simul-gotcha \ + size-gotcha \ + transfer-gotcha \ + write-gotcha \ + writeread-gotcha if HAVE_HDF5 - libexec_PROGRAMS += \ - app-hdf5-create-gotcha \ - app-hdf5-writeread-gotcha +libexec_PROGRAMS += \ + app-hdf5-create-gotcha \ + app-hdf5-writeread-gotcha endif #HAVE_HDF5 if HAVE_FORTRAN libexec_PROGRAMS += \ writeread-ftn endif #HAVE_FORTRAN + endif #HAVE_GOTCHA CLEANFILES = $(libexec_PROGRAMS) -noinst_HEADERS = \ - testlib.h \ - testutil.h \ - testutil_rdwr.h # Common compile/link flag definitions -test_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ - -I$(top_srcdir)/client/src -I$(top_srcdir)/common/src +ex_includes = -I$(top_srcdir)/client/src -I$(top_srcdir)/common/src + +ex_cppflags = $(AM_CPPFLAGS) $(ex_includes) +ex_mpi_cppflags = $(ex_cppflags) $(MPI_CFLAGS) if USE_PMPI_WRAPPERS -test_cppflags += -DENABLE_MPI_MOUNT -endif +ex_mpi_cppflags += -DENABLE_MPI_MOUNT +ex_gotcha_lib = \ + $(top_builddir)/client/src/libunifyfs_mpi_gotcha.la \ + $(top_builddir)/client/src/libunifyfs_gotcha.la +ex_static_lib = \ + $(top_builddir)/client/src/libunifyfs_mpi.la \ + $(top_builddir)/client/src/libunifyfs.la +else +ex_gotcha_lib = $(top_builddir)/client/src/libunifyfs_gotcha.la +ex_static_lib = $(top_builddir)/client/src/libunifyfs.la +endif #USE_PMPI_WRAPPERS if HAVE_FORTRAN -test_ftn_flags = $(AM_FCFLAGS) $(MPI_FFLAGS) \ - -I$(top_srcdir)/client/src -I$(top_srcdir)/common/src -test_ftn_ldadd = $(top_builddir)/client/src/libunifyfsf.la -lrt -lm $(FCLIBS) -test_ftn_ldflags = $(AM_LDFLAGS) $(MPI_FLDFLAGS) -endif - -test_gotcha_ldadd = $(top_builddir)/client/src/libunifyfs_gotcha.la -lrt -lm -test_gotcha_ldflags = $(AM_LDFLAGS) $(MPI_CLDFLAGS) -test_posix_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) -DDISABLE_UNIFYFS -test_posix_ldadd = -lrt -lm -test_posix_ldflags = $(AM_LDFLAGS) $(MPI_CLDFLAGS) +ex_ftn_flags = $(AM_FCFLAGS) $(ex_includes) +ex_ftn_mpi_flags = $(ex_ftn_flags) $(MPI_FFLAGS) -test_static_ldadd = $(top_builddir)/client/src/libunifyfs.la -lrt -lm -test_static_ldflags = -static $(CP_WRAPPERS) $(AM_LDFLAGS) $(MPI_CLDFLAGS) +ex_ftn_ldadd = \ + $(top_builddir)/client/src/libunifyfsf.la \ + -lrt -lm \ + $(FCLIBS) -# Per-target flags begin here +ex_ftn_mpi_ldadd = \ + $(ex_ftn_ldadd) \ + $(MPI_FLDFLAGS) 
-sysio_write_gotcha_SOURCES = sysio-write.c -sysio_write_gotcha_CPPFLAGS = $(test_cppflags) -sysio_write_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_write_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_write_static_SOURCES = sysio-write.c -sysio_write_static_CPPFLAGS = $(test_cppflags) -sysio_write_static_LDADD = $(test_static_ldadd) -sysio_write_static_LDFLAGS = $(test_static_ldflags) - -sysio_read_gotcha_SOURCES = sysio-read.c -sysio_read_gotcha_CPPFLAGS = $(test_cppflags) -sysio_read_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_read_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_read_static_SOURCES = sysio-read.c -sysio_read_static_CPPFLAGS = $(test_cppflags) -sysio_read_static_LDADD = $(test_static_ldadd) -sysio_read_static_LDFLAGS = $(test_static_ldflags) - -sysio_writeread_posix_SOURCES = sysio-writeread.c -sysio_writeread_posix_CPPFLAGS = $(test_posix_cppflags) -sysio_writeread_posix_LDADD = $(test_posix_ldadd) -sysio_writeread_posix_LDFLAGS = $(test_posix_ldflags) - -sysio_writeread_gotcha_SOURCES = sysio-writeread.c -sysio_writeread_gotcha_CPPFLAGS = $(test_cppflags) -sysio_writeread_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_writeread_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_writeread_static_SOURCES = sysio-writeread.c -sysio_writeread_static_CPPFLAGS = $(test_cppflags) -sysio_writeread_static_LDADD = $(test_static_ldadd) -sysio_writeread_static_LDFLAGS = $(test_static_ldflags) - -sysio_writeread2_gotcha_SOURCES = sysio-writeread2.c -sysio_writeread2_gotcha_CPPFLAGS = $(test_cppflags) -sysio_writeread2_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_writeread2_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_writeread2_static_SOURCES = sysio-writeread2.c -sysio_writeread2_static_CPPFLAGS = $(test_cppflags) -sysio_writeread2_static_LDADD = $(test_static_ldadd) -sysio_writeread2_static_LDFLAGS = $(test_static_ldflags) - -sysio_dir_gotcha_SOURCES = sysio-dir.c -sysio_dir_gotcha_CPPFLAGS = $(test_cppflags) -sysio_dir_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_dir_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_dir_static_SOURCES = sysio-dir.c -sysio_dir_static_CPPFLAGS = $(test_cppflags) -sysio_dir_static_LDADD = $(test_static_ldadd) -sysio_dir_static_LDFLAGS = $(test_static_ldflags) - -sysio_stat_gotcha_SOURCES = sysio-stat.c -sysio_stat_gotcha_CPPFLAGS = $(test_cppflags) -sysio_stat_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_stat_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_stat_static_SOURCES = sysio-stat.c -sysio_stat_static_CPPFLAGS = $(test_cppflags) -sysio_stat_static_LDADD = $(test_static_ldadd) -sysio_stat_static_LDFLAGS = $(test_static_ldflags) - -sysio_cp_gotcha_SOURCES = sysio-cp.c -sysio_cp_gotcha_CPPFLAGS = $(test_cppflags) -sysio_cp_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_cp_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_cp_static_SOURCES = sysio-cp.c -sysio_cp_static_CPPFLAGS = $(test_cppflags) -sysio_cp_static_LDADD = $(test_static_ldadd) -sysio_cp_static_LDFLAGS = $(test_static_ldflags) - -sysio_truncate_gotcha_SOURCES = sysio-truncate.c -sysio_truncate_gotcha_CPPFLAGS = $(test_cppflags) -sysio_truncate_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_truncate_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_truncate_static_SOURCES = sysio-truncate.c -sysio_truncate_static_CPPFLAGS = $(test_cppflags) -sysio_truncate_static_LDADD = $(test_static_ldadd) -sysio_truncate_static_LDFLAGS = $(test_static_ldflags) - -sysio_unlink_gotcha_SOURCES = sysio-unlink.c -sysio_unlink_gotcha_CPPFLAGS = $(test_cppflags) -sysio_unlink_gotcha_LDADD = $(test_gotcha_ldadd) 
-sysio_unlink_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_unlink_static_SOURCES = sysio-unlink.c -sysio_unlink_static_CPPFLAGS = $(test_cppflags) -sysio_unlink_static_LDADD = $(test_static_ldadd) -sysio_unlink_static_LDFLAGS = $(test_static_ldflags) - -sysio_open_gotcha_SOURCES = sysio-open.c -sysio_open_gotcha_CPPFLAGS = $(test_cppflags) -sysio_open_gotcha_LDADD = $(test_gotcha_ldadd) -sysio_open_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -sysio_open_static_SOURCES = sysio-open.c -sysio_open_static_CPPFLAGS = $(test_cppflags) -sysio_open_static_LDADD = $(test_static_ldadd) -sysio_open_static_LDFLAGS = $(test_static_ldflags) - -cr_posix_SOURCES = checkpoint-restart.c -cr_posix_CPPFLAGS = $(test_posix_cppflags) -cr_posix_LDADD = $(test_posix_ldadd) -cr_posix_LDFLAGS = $(test_posix_ldflags) - -cr_gotcha_SOURCES = checkpoint-restart.c -cr_gotcha_CPPFLAGS = $(test_cppflags) -cr_gotcha_LDADD = $(test_gotcha_ldadd) -cr_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -cr_static_SOURCES = checkpoint-restart.c -cr_static_CPPFLAGS = $(test_cppflags) -cr_static_LDADD = $(test_static_ldadd) -cr_static_LDFLAGS = $(test_static_ldflags) - -read_posix_SOURCES = read.c -read_posix_CPPFLAGS = $(test_posix_cppflags) -read_posix_LDADD = $(test_posix_ldadd) -read_posix_LDFLAGS = $(test_posix_ldflags) - -read_gotcha_SOURCES = read.c -read_gotcha_CPPFLAGS = $(test_cppflags) -read_gotcha_LDADD = $(test_gotcha_ldadd) -read_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -read_static_SOURCES = read.c -read_static_CPPFLAGS = $(test_cppflags) -read_static_LDADD = $(test_static_ldadd) -read_static_LDFLAGS = $(test_static_ldflags) - -write_posix_SOURCES = write.c -write_posix_CPPFLAGS = $(test_posix_cppflags) -write_posix_LDADD = $(test_posix_ldadd) -write_posix_LDFLAGS = $(test_posix_ldflags) - -write_gotcha_SOURCES = write.c -write_gotcha_CPPFLAGS = $(test_cppflags) -write_gotcha_LDADD = $(test_gotcha_ldadd) -write_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -write_static_SOURCES = write.c -write_static_CPPFLAGS = $(test_cppflags) -write_static_LDADD = $(test_static_ldadd) -write_static_LDFLAGS = $(test_static_ldflags) - -writeread_posix_SOURCES = writeread.c testutil.c -writeread_posix_CPPFLAGS = $(test_posix_cppflags) -writeread_posix_LDADD = $(test_posix_ldadd) -writeread_posix_LDFLAGS = $(test_posix_ldflags) - -writeread_gotcha_SOURCES = writeread.c testutil.c -writeread_gotcha_CPPFLAGS = $(test_cppflags) -writeread_gotcha_LDADD = $(test_gotcha_ldadd) -writeread_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -writeread_static_SOURCES = writeread.c testutil.c -writeread_static_CPPFLAGS = $(test_cppflags) -writeread_static_LDADD = $(test_static_ldadd) -writeread_static_LDFLAGS = $(test_static_ldflags) +endif #HAVE_FORTRAN -app_mpiio_gotcha_SOURCES = app-mpiio.c -app_mpiio_gotcha_CPPFLAGS = $(test_cppflags) -app_mpiio_gotcha_LDADD = $(test_gotcha_ldadd) -app_mpiio_gotcha_LDFLAGS = $(test_gotcha_ldflags) +if HAVE_HDF5 +ex_hdf_ldadd = $(HDF5_LDFLAGS) $(HDF5_LIBS) +endif #HAVE_HDF5 -app_mpiio_static_SOURCES = app-mpiio.c -app_mpiio_static_CPPFLAGS = $(test_cppflags) -app_mpiio_static_LDADD = $(test_static_ldadd) -app_mpiio_static_LDFLAGS = $(test_static_ldflags) +ex_gotcha_ldadd = $(ex_gotcha_lib) -lrt -lm +ex_gotcha_mpi_ldadd = $(ex_gotcha_ldadd) $(MPI_CLDFLAGS) -app_btio_gotcha_SOURCES = app-btio.c -app_btio_gotcha_CPPFLAGS = $(test_cppflags) -app_btio_gotcha_LDADD = $(test_gotcha_ldadd) -app_btio_gotcha_LDFLAGS = $(test_gotcha_ldflags) +ex_posix_cppflags = $(AM_CPPFLAGS) -DDISABLE_UNIFYFS +ex_posix_mpi_cppflags = 
$(ex_posix_cppflags) $(MPI_CFLAGS) -app_btio_static_SOURCES = app-btio.c -app_btio_static_CPPFLAGS = $(test_cppflags) -app_btio_static_LDADD = $(test_static_ldadd) -app_btio_static_LDFLAGS = $(test_static_ldflags) +ex_posix_ldadd = -lrt -lm +ex_posix_mpi_ldadd = $(ex_posix_ldadd) $(MPI_CLDFLAGS) -app_tileio_gotcha_SOURCES = app-tileio.c -app_tileio_gotcha_CPPFLAGS = $(test_cppflags) -app_tileio_gotcha_LDADD = $(test_gotcha_ldadd) -app_tileio_gotcha_LDFLAGS = $(test_gotcha_ldflags) +ex_static_ldadd = $(ex_static_lib) -lrt -lm +ex_static_mpi_ldadd = $(ex_static_ldadd) $(MPI_CLDFLAGS) +ex_static_ldflags = $(AM_LDFLAGS) -static $(CP_WRAPPERS) -app_tileio_static_SOURCES = app-tileio.c -app_tileio_static_CPPFLAGS = $(test_cppflags) -app_tileio_static_LDADD = $(test_static_ldadd) -app_tileio_static_LDFLAGS = $(test_static_ldflags) - -if HAVE_FORTRAN - -writeread_ftn_SOURCES = writeread.f90 -writeread_ftn_FCFLAGS = $(test_ftn_flags) -writeread_ftn_LDADD = $(test_ftn_ldadd) -writeread_ftn_LDFLAGS = $(test_ftn_ldflags) +# Per-target flags begin here -endif +app_btio_gotcha_SOURCES = app-btio.c +app_btio_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +app_btio_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) -if HAVE_HDF5 +app_btio_static_SOURCES = app-btio.c +app_btio_static_CPPFLAGS = $(ex_mpi_cppflags) +app_btio_static_LDADD = $(ex_static_mpi_ldadd) +app_btio_static_LDFLAGS = $(ex_static_ldflags) app_hdf5_create_gotcha_SOURCES = app-hdf5-create.c -app_hdf5_create_gotcha_CPPFLAGS = $(test_cppflags) $(HDF5_CPPFLAGS) -app_hdf5_create_gotcha_LDADD = $(test_gotcha_ldadd) $(HDF5_LIBS) -app_hdf5_create_gotcha_LDFLAGS = $(test_gotcha_ldflags) $(HDF5_LDFLAGS) +app_hdf5_create_gotcha_CPPFLAGS = $(ex_mpi_cppflags) $(HDF5_CPPFLAGS) +app_hdf5_create_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) $(ex_hdf_ldadd) app_hdf5_writeread_gotcha_SOURCES = app-hdf5-writeread.c -app_hdf5_writeread_gotcha_CPPFLAGS = $(test_cppflags) $(HDF5_CPPFLAGS) -app_hdf5_writeread_gotcha_LDADD = $(test_gotcha_ldadd) $(HDF5_LIBS) -app_hdf5_writeread_gotcha_LDFLAGS = $(test_gotcha_ldflags) $(HDF5_LDFLAGS) - -endif +app_hdf5_writeread_gotcha_CPPFLAGS = $(ex_mpi_cppflags) $(HDF5_CPPFLAGS) +app_hdf5_writeread_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) $(ex_hdf_ldadd) -transfer_gotcha_SOURCES = transfer.c -transfer_gotcha_CPPFLAGS = $(test_cppflags) -transfer_gotcha_LDADD = $(test_gotcha_ldadd) -transfer_gotcha_LDFLAGS = $(test_gotcha_ldflags) +app_mpiio_gotcha_SOURCES = app-mpiio.c +app_mpiio_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +app_mpiio_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) -transfer_static_SOURCES = transfer.c -transfer_static_CPPFLAGS = $(test_cppflags) -transfer_static_LDADD = $(test_static_ldadd) -transfer_static_LDFLAGS = $(test_static_ldflags) +app_mpiio_static_SOURCES = app-mpiio.c +app_mpiio_static_CPPFLAGS = $(ex_mpi_cppflags) +app_mpiio_static_LDADD = $(ex_static_mpi_ldadd) +app_mpiio_static_LDFLAGS = $(ex_static_ldflags) -size_gotcha_SOURCES = size.c testutil.c -size_gotcha_CPPFLAGS = $(test_cppflags) -size_gotcha_LDADD = $(test_gotcha_ldadd) -size_gotcha_LDFLAGS = $(test_gotcha_ldflags) +app_tileio_gotcha_SOURCES = app-tileio.c +app_tileio_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +app_tileio_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) -size_static_SOURCES = size.c testutil.c -size_static_CPPFLAGS = $(test_cppflags) -size_static_LDADD = $(test_static_ldadd) -size_static_LDFLAGS = $(test_static_ldflags) +app_tileio_static_SOURCES = app-tileio.c +app_tileio_static_CPPFLAGS = $(ex_mpi_cppflags) +app_tileio_static_LDADD = $(ex_static_mpi_ldadd) +app_tileio_static_LDFLAGS = 
$(ex_static_ldflags) + +chmod_gotcha_SOURCES = chmod.c testutil.c $(testutil_headers) +chmod_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +chmod_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +chmod_static_SOURCES = chmod.c testutil.c $(testutil_headers) +chmod_static_CPPFLAGS = $(ex_mpi_cppflags) +chmod_static_LDADD = $(ex_static_mpi_ldadd) +chmod_static_LDFLAGS = $(ex_static_ldflags) + +cr_posix_SOURCES = checkpoint-restart.c $(testutil_headers) +cr_posix_CPPFLAGS = $(ex_posix_mpi_cppflags) +cr_posix_LDADD = $(ex_posix_mpi_ldadd) + +cr_gotcha_SOURCES = checkpoint-restart.c $(testutil_headers) +cr_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +cr_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +cr_static_SOURCES = checkpoint-restart.c $(testutil_headers) +cr_static_CPPFLAGS = $(ex_mpi_cppflags) +cr_static_LDADD = $(ex_static_mpi_ldadd) +cr_static_LDFLAGS = $(ex_static_ldflags) + +multi_write_gotcha_SOURCES = multi-write.c testutil.c $(testutil_headers) +multi_write_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +multi_write_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +multi_write_static_SOURCES = multi-write.c testutil.c $(testutil_headers) +multi_write_static_CPPFLAGS = $(ex_mpi_cppflags) +multi_write_static_LDADD = $(ex_static_mpi_ldadd) +multi_write_static_LDFLAGS = $(ex_static_ldflags) + +read_posix_SOURCES = read.c $(testutil_headers) +read_posix_CPPFLAGS = $(ex_posix_mpi_cppflags) +read_posix_LDADD = $(ex_posix_mpi_ldadd) + +read_gotcha_SOURCES = read.c $(testutil_headers) +read_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +read_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +read_static_SOURCES = read.c $(testutil_headers) +read_static_CPPFLAGS = $(ex_mpi_cppflags) +read_static_LDADD = $(ex_static_mpi_ldadd) +read_static_LDFLAGS = $(ex_static_ldflags) + +read_data_gotcha_SOURCES = read-data.c $(testutil_headers) +read_data_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +read_data_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +read_data_static_SOURCES = read-data.c $(testutil_headers) +read_data_static_CPPFLAGS = $(ex_mpi_cppflags) +read_data_static_LDADD = $(ex_static_mpi_ldadd) +read_data_static_LDFLAGS = $(ex_static_ldflags) simul_gotcha_SOURCES = simul.c -simul_gotcha_CPPFLAGS = $(test_cppflags) -simul_gotcha_LDADD = $(test_gotcha_ldadd) -simul_gotcha_LDFLAGS = $(test_gotcha_ldflags) +simul_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +simul_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) simul_static_SOURCES = simul.c -simul_static_CPPFLAGS = $(test_cppflags) -simul_static_LDADD = $(test_static_ldadd) -simul_static_LDFLAGS = $(test_static_ldflags) - -chmod_gotcha_SOURCES = chmod.c testutil.c -chmod_gotcha_CPPFLAGS = $(test_cppflags) -chmod_gotcha_LDADD = $(test_gotcha_ldadd) -chmod_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -chmod_static_SOURCES = chmod.c testutil.c -chmod_static_CPPFLAGS = $(test_cppflags) -chmod_static_LDADD = $(test_static_ldadd) -chmod_static_LDFLAGS = $(test_static_ldflags) - -multi_write_gotcha_SOURCES = multi-write.c testutil.c -multi_write_gotcha_CPPFLAGS = $(test_cppflags) -multi_write_gotcha_LDADD = $(test_gotcha_ldadd) -multi_write_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -multi_write_static_SOURCES = multi-write.c testutil.c -multi_write_static_CPPFLAGS = $(test_cppflags) -multi_write_static_LDADD = $(test_static_ldadd) -multi_write_static_LDFLAGS = $(test_static_ldflags) - -read_data_gotcha_SOURCES = read-data.c -read_data_gotcha_CPPFLAGS = $(test_cppflags) -read_data_gotcha_LDADD = $(test_gotcha_ldadd) -read_data_gotcha_LDFLAGS = $(test_gotcha_ldflags) - -read_data_static_SOURCES = read-data.c -read_data_static_CPPFLAGS = $(test_cppflags) 
-read_data_static_LDADD = $(test_static_ldadd) -read_data_static_LDFLAGS = $(test_static_ldflags) +simul_static_CPPFLAGS = $(ex_mpi_cppflags) +simul_static_LDADD = $(ex_static_mpi_ldadd) +simul_static_LDFLAGS = $(ex_static_ldflags) + +size_gotcha_SOURCES = size.c testutil.c $(testutil_headers) +size_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +size_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +size_static_SOURCES = size.c testutil.c $(testutil_headers) +size_static_CPPFLAGS = $(ex_mpi_cppflags) +size_static_LDADD = $(ex_static_mpi_ldadd) +size_static_LDFLAGS = $(ex_static_ldflags) + +transfer_gotcha_SOURCES = transfer.c +transfer_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +transfer_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +transfer_static_SOURCES = transfer.c +transfer_static_CPPFLAGS = $(ex_mpi_cppflags) +transfer_static_LDADD = $(ex_static_mpi_ldadd) +transfer_static_LDFLAGS = $(ex_static_ldflags) + +write_posix_SOURCES = write.c $(testutil_headers) +write_posix_CPPFLAGS = $(ex_posix_mpi_cppflags) +write_posix_LDADD = $(ex_posix_mpi_ldadd) + +write_gotcha_SOURCES = write.c $(testutil_headers) +write_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +write_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +write_static_SOURCES = write.c $(testutil_headers) +write_static_CPPFLAGS = $(ex_mpi_cppflags) +write_static_LDADD = $(ex_static_mpi_ldadd) +write_static_LDFLAGS = $(ex_static_ldflags) + +writeread_posix_SOURCES = writeread.c testutil.c $(testutil_headers) +writeread_posix_CPPFLAGS = $(ex_posix_mpi_cppflags) +writeread_posix_LDADD = $(ex_posix_mpi_ldadd) + +writeread_ftn_SOURCES = writeread.f90 +writeread_ftn_FCFLAGS = $(ex_ftn_mpi_flags) +writeread_ftn_LDADD = $(ex_ftn_mpi_ldadd) + +writeread_gotcha_SOURCES = writeread.c testutil.c $(testutil_headers) +writeread_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +writeread_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +writeread_static_SOURCES = writeread.c testutil.c $(testutil_headers) +writeread_static_CPPFLAGS = $(ex_mpi_cppflags) +writeread_static_LDADD = $(ex_static_mpi_ldadd) +writeread_static_LDFLAGS = $(ex_static_ldflags) diff --git a/examples/src/sysio-cp.c b/examples/src/sysio-cp.c deleted file mode 100644 index 0c04bf1ec..000000000 --- a/examples/src/sysio-cp.c +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int rank; -static int total_ranks; -static int rank_worker; -static int debug; - -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static int unmount; /* unmount unifyfs after running the test */ - -static char* srcpath; -static char* dstpath; - -static unsigned long bufsize = 64 * (1 << 10); - -static int resolve_dstpath(char* path) -{ - int ret = 0; - struct stat sb = { 0, }; - char* tmp = NULL; - char* tmpdir = NULL; - char* filename = basename(srcpath); /* do not free(3) */ - - if (path[strlen(path) - 1] == '/') { - path[strlen(path) - 1] = '\0'; - } - - if (path[0] != '/') { - tmp = realpath(path, NULL); - if (!tmp) { - return errno; - } - } else { - tmp = strdup(path); - } - - ret = stat(tmp, &sb); - if (ret == 0 && S_ISDIR(sb.st_mode)) { - dstpath = calloc(1, strlen(tmp) + strlen(filename) + 2); - if (!dstpath) { - return ENOMEM; - } - - sprintf(dstpath, "%s/%s", tmp, filename); - free(tmp); - } else { - dstpath = tmp; /* further error will be resolved when open(2) */ - } - - return 0; -} - -static int do_copy(void) -{ - int ret = 0; - int fd_src = 0; - int fd_dst = 0; - ssize_t n_read = 0; - ssize_t n_written = 0; - ssize_t n_left = 0; - char* buf = malloc(bufsize); - - if (!buf) { - return ENOMEM; - } - - fd_src = open(srcpath, O_RDONLY); - if (fd_src < 0) { - return errno; - } - - fd_dst = open(dstpath, O_CREAT | O_WRONLY | O_TRUNC, 0644); - if (fd_dst < 0) { - ret = errno; - goto out_close_src; - } - - while (1) { - n_read = read(fd_src, buf, bufsize); - if (n_read == 0) { /* EOF */ - break; - } - - if (n_read < 0) { /* error */ - ret = errno; - goto out_close_dst; - } - - n_left = n_read; - - do { - n_written = write(fd_dst, buf, n_left); - if (n_written < 0) { - ret = errno; - goto out_close_dst; - } - - if (n_written == 0 && errno && errno != EAGAIN) { - ret = errno; - goto out_close_dst; - } - - n_left -= n_written; - } while (n_left); - } - - fsync(fd_dst); - -out_close_dst: - close(fd_dst); -out_close_src: - close(fd_src); - - return ret; -} - -static struct option const long_opts[] = { - { "bufsize", 1, 0, 'b' }, - { "debug", 0, 0, 'd' }, - { "help", 0, 0, 'h' }, - { "mount", 1, 0, 'm' }, - { "rank", 1, 0, 'r' }, - { "unmount", 0, 0, 'u' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "b:dhm:r:u"; - -static const char* usage_str = - "\n" - "Usage: %s [options...] 
\n" - "\n" - "Available options:\n" - " -b, --bufsize= use for copy buffer\n" - " (default: 64KB)\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -h, --help help message\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -r, --rank= use to copy the file (default: 0)\n" - " -u, --unmount unmount the filesystem after test\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 0; - struct stat sb = { 0, }; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'b': - bufsize = strtoul(optarg, NULL, 0); - break; - - case 'd': - debug = 1; - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'r': - rank_worker = atoi(optarg); - break; - - case 'u': - unmount = 1; - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (argc - optind != 2) { - print_usage(); - } - - srcpath = strdup(argv[optind++]); - - if (srcpath[strlen(srcpath) - 1] == '/') { - srcpath[strlen(srcpath) - 1] = '\0'; - } - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - goto out; - } - - if (rank_worker >= total_ranks) { - test_print(rank, "%d is not a valid rank"); - goto out; - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank != rank_worker) { - goto donothing; - } - - ret = stat(srcpath, &sb); - if (ret < 0) { - test_print(rank, "stat failed on \"%s\"", srcpath); - goto out; - } - - ret = resolve_dstpath(argv[optind]); - if (ret) { - test_print(rank, "cannot resolve the destination path \"%s\" (%s)", - dstpath, strerror(ret)); - goto out; - } - - ret = do_copy(); - if (ret) { - test_print(rank, "copy failed (%d: %s)", ret, strerror(ret)); - } - - free(dstpath); - free(srcpath); - -donothing: - MPI_Barrier(MPI_COMM_WORLD); - - if (unmount) { - unifyfs_unmount(); - } - -out: - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-dir.c b/examples/src/sysio-dir.c deleted file mode 100644 index 5b37bf26d..000000000 --- a/examples/src/sysio-dir.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int standard; /* not mounting unifyfs when set */ -static int synchronous; /* sync metadata for each op? 
(default: no)*/ - -static int rank; -static int total_ranks; - -static int debug; /* pause for attaching debugger */ -static int unmount; /* unmount unifyfs after running the test */ -static uint64_t count = 10; /* number of directories each rank creates */ -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* testdir = "testdir"; /* test directory under mountpoint */ -static char targetdir[NAME_MAX]; /* target file name */ - -static char dirnamebuf[NAME_MAX]; - -static int do_mkdir(void) -{ - int ret = 0; - uint64_t i = 0; - mode_t mode = 0700; - struct stat sb = { 0, }; - - ret = stat(targetdir, &sb); - if (ret < 0 && errno == ENOENT) { - ret = mkdir(targetdir, mode); - if (ret < 0) { - perror("mkdir"); - return -1; - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - sprintf(dirnamebuf, "%s/rank-%d", targetdir, rank); - - ret = mkdir(dirnamebuf, mode); - if (ret < 0) { - test_print(rank, "mkdir failed for %s", dirnamebuf); - ret = -errno; - goto out; - } - - for (i = 0; i < count; i++) { - sprintf(dirnamebuf, "%s/rank-%d/dir-%lu", targetdir, rank, i); - - ret = mkdir(dirnamebuf, mode); - if (ret < 0) { - test_print(rank, "mkdir failed for %s", dirnamebuf); - ret = -errno; - goto out; - } - } - -out: - return ret; -} - -static int do_stat(void) -{ - int ret = 0; - uint64_t i = 0; - struct stat sb = { 0, }; - - sprintf(dirnamebuf, "%s/rank-%d", targetdir, rank); - - ret = stat(dirnamebuf, &sb); - if (ret < 0) { - test_print(rank, "stat failed for %s", dirnamebuf); - ret = -errno; - goto out; - } - - for (i = 0; i < count; i++) { - sprintf(dirnamebuf, "%s/rank-%d/dir-%lu", targetdir, rank, i); - - ret = stat(dirnamebuf, &sb); - if (ret < 0) { - test_print(rank, "stat failed for %s", dirnamebuf); - ret = -errno; - goto out; - } - - /* print some fields.. 
*/ - printf("\n## %s\n" - "ino: %lu\n" - "mode: %o\n" - "ctime: %lu\n" - "atime: %lu\n" - "mtime: %lu\n", - dirnamebuf, - sb.st_ino, sb.st_mode, - sb.st_ctime, sb.st_atime, sb.st_mtime); - } - -out: - return ret; -} - -static int do_readdir(void) -{ - return 0; -} - -static int do_rmdir(void) -{ - return 0; -} - -enum { - DIRTEST_ALL = 0, - DIRTEST_MKDIR, - DIRTEST_STAT, - DIRTEST_READDIR, - DIRTEST_RMDIR, - N_DIRTESTS, -}; - -static int singletest; - -static const char* singletest_names[N_DIRTESTS] = { - "all", "mkdir", "stat", "readdir", "rmdir" -}; - -static int set_singletest(const char* testname) -{ - int i = 0; - - if (singletest) { - fprintf(stderr, "Only a single test can be performed with " - "--singletest option.\n"); - exit(1); - } - - for (i = 0; i < N_DIRTESTS; i++) - if (strcmp(testname, singletest_names[i]) == 0) { - return i; - } - - fprintf(stderr, "%s is not a valid test name.\n", testname); - exit(1); -} - -typedef int (*dirtest_func_t)(void); - -static dirtest_func_t test_funcs[N_DIRTESTS] = { - 0, do_mkdir, do_stat, do_readdir, do_rmdir -}; - -static struct option const long_opts[] = { - { "debug", 0, 0, 'd' }, - { "dirname", 1, 0, 'D' }, - { "help", 0, 0, 'h' }, - { "mount", 1, 0, 'm' }, - { "count", 1, 0, 'n' }, - { "synchronous", 0, 0, 'S' }, - { "standard", 0, 0, 's' }, - { "singletest", 1, 0, 't' }, - { "unmount", 0, 0, 'u' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "dD:hm:n:Sst:u"; - -static const char* usage_str = - "\n" - "Usage: %s [options...]\n" - "\n" - "Available options:\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -D, --dirname= test directory name under mountpoint\n" - " (default: testdir)\n" - " -h, --help help message\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -n, --count= number of directories that each rank will\n" - " create (default: 10)\n" - " -S, --synchronous sync metadata on each write\n" - " -s, --standard do not use unifyfs but run standard I/O\n" - " -t, --singletest= only test a single operation\n" - " (operations: mkdir, stat, readdir, rmdir)\n" - " -u, --unmount unmount the filesystem after test\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 2; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'd': - debug = 1; - break; - - case 'D': - testdir = strdup(optarg); - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'n': - count = strtoull(optarg, 0, 0); - break; - - case 'S': - synchronous = 1; - break; - - case 's': - standard = 1; - break; - - case 't': - singletest = set_singletest(optarg); - break; - - case 'u': - unmount = 1; - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (static_linked(program) && standard) { - test_print_once(rank, "--standard, -s option only works when " - "dynamically linked."); - exit(-1); - } - - sprintf(targetdir, "%s/%s", mountpoint, testdir); - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - } - - MPI_Barrier(MPI_COMM_WORLD); 
- - if (singletest) { - test_print_once(rank, "only testing %s ..", - singletest_names[singletest]); - ret = test_funcs[singletest](); - if (ret < 0) { - fprintf(stderr, "%s test failed.\n", singletest_names[singletest]); - goto out; - } - goto out_unmount; - } - - ret = do_mkdir(); - if (ret < 0) { - fprintf(stderr, "directory creation failed..\n"); - goto out; - } - - ret = do_stat(); - if (ret < 0) { - fprintf(stderr, "directory stat failed..\n"); - goto out; - } - - ret = do_readdir(); - if (ret < 0) { - fprintf(stderr, "directory read failed..\n"); - goto out; - } - - ret = do_rmdir(); - if (ret < 0) { - fprintf(stderr, "directory failed..\n"); - goto out; - } - - MPI_Barrier(MPI_COMM_WORLD); - -out_unmount: - if (!standard && unmount) { - unifyfs_unmount(); - } -out: - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-open.c b/examples/src/sysio-open.c deleted file mode 100644 index ca550a196..000000000 --- a/examples/src/sysio-open.c +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int fd; /* target file descriptor */ -static int standard; /* not mounting unifyfs when set */ - -static int rank; -static int total_ranks; - -static int create_rank; -static int open_rank; -static int do_stat; /* perform stat after closing the file */ -static int debug; /* pause for attaching debugger */ -static int exclusive; -static int trunc; -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "testfile"; /* testfile name under mountpoint */ -static char targetfile[NAME_MAX]; /* target file name */ - -static struct option long_opts[] = { - { "create", 1, 0, 'c' }, - { "debug", 0, 0, 'd' }, - { "exclusive", 0, 0, 'e' }, - { "filename", 1, 0, 'f' }, - { "help", 0, 0, 'h' }, - { "mount", 1, 0, 'm' }, - { "open", 1, 0, 'o' }, - { "standard", 0, 0, 's' }, - { "stat", 0, 0, 'S' }, - { "truncate", 0, 0, 't' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "c:def:hm:o:sSt"; - -static const char* usage_str = - "\n" - "Usage: %s [options...]\n" - "\n" - "Available options:\n" - " -c, --create= create the file from \n" - " (default: 0)\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -e, --exclusive pass O_EXCL to fail when the file exists\n" - " -f, --filename= target file name under mountpoint\n" - " (default: testfile)\n" - " -h, --help help message\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -o, --open= open file from after create\n" - " (default: 0)\n" - " -s, --standard do not use unifyfs but run standard I/O\n" - " -S, --stat perform stat after closing\n" - " -t, --truncate truncate file if exists\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 2; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - 
MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'c': - create_rank = atoi(optarg); - break; - - case 'd': - debug = 1; - break; - - case 'e': - exclusive = 1; - break; - - case 'f': - filename = strdup(optarg); - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'o': - open_rank = atoi(optarg); - break; - - case 's': - standard = 1; - break; - - case 'S': - do_stat = 1; - break; - - case 't': - trunc = 1; - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (static_linked(program) && standard) { - test_print_once(rank, "--standard, -s option only works when " - "dynamically linked."); - exit(-1); - } - - sprintf(targetfile, "%s/%s", mountpoint, filename); - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - if (exclusive && trunc) { - test_print_once(rank, "-e and -t cannot be used together."); - exit(-1); - } - - if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - } - - if ((create_rank < 0 || create_rank > total_ranks - 1) || - (open_rank < 0 || open_rank > total_ranks - 1)) { - test_print(rank, "please specify valid rank\n"); - exit(-1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - /* create the file from the create_rank */ - if (rank == create_rank) { - int flags = O_CREAT|O_RDWR; - - if (exclusive) { - flags |= O_EXCL; - } else if (trunc) { - flags |= O_TRUNC; - } - - fd = open(targetfile, flags, 0600); - if (fd < 0) { - test_print(rank, "open failed (%d: %s)\n", errno, strerror(errno)); - } else { - test_print(rank, "created file %s successfully\n", targetfile); - close(fd); - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - errno = 0; - - /* open from all ranks (open_rank == 0) or a specific open_rank */ - if (!open_rank || rank == open_rank) { - fd = open(targetfile, O_RDWR); - if (fd < 0) { - test_print(rank, "open failed (%d: %s)\n", errno, strerror(errno)); - } else { - test_print(rank, "opened file %s successfully\n", targetfile); - close(fd); - } - - if (do_stat) { - struct stat sb = { 0, }; - - errno = 0; - ret = stat(targetfile, &sb); - if (ret < 0) { - test_print(rank, "stat failed (%d: %s)\n", - errno, strerror(errno)); - } else { - dump_stat(rank, &sb, targetfile); - } - } - } - - if (!standard) { - unifyfs_unmount(); - } - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-read.c b/examples/src/sysio-read.c deleted file mode 100644 index fedbc431a..000000000 --- a/examples/src/sysio-read.c +++ /dev/null @@ -1,434 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static uint64_t blocksize = 1 << 20; /* 1MB */ -static uint64_t nblocks = 128; /* Each process reads 128MB */ -static uint64_t chunksize = 64 * (1 << 10); /* 64KB for each read(2) call */ - -static int use_listio; /* use lio_listio(2) */ -static int use_pread; /* use pread(2) */ -static int pattern; /* N to 1 (N1, default) or N to N (NN) */ -static int fd; /* target file descriptor */ - -static int lipsum; /* check contents written by the write test. */ -static int standard; /* not mounting unifyfs when set */ - -/* time statistics */ -static struct timeval read_start, read_end; - -static int rank; -static int total_ranks; - -static int debug; /* pause for attaching debugger */ -static int unmount; /* unmount unifyfs after running the test */ -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "testfile"; /* testfile name under mountpoint */ -static char targetfile[NAME_MAX]; /* target file name */ - -static char* buf; /* I/O buffer */ -static uint64_t n_aiocb_list; /* number of aio requests */ -static struct aiocb** aiocb_list; /* aio request list */ -static struct aiocb* aiocb_items; /* aio requests */ - -static int do_read(void) -{ - int ret = 0; - uint64_t i, j, offset; - uint64_t nchunks = blocksize / chunksize; - - gettimeofday(&read_start, NULL); - - for (i = 0; i < nblocks; i++) { - for (j = 0; j < nchunks; j++) { - if (pattern == IO_PATTERN_N1) - offset = i * total_ranks * blocksize + rank * blocksize - + j * chunksize; - else { - offset = i * blocksize + j * chunksize; - } - - if (use_pread) { - ret = pread(fd, buf, chunksize, offset); - } else { - lseek(fd, offset, SEEK_SET); - ret = read(fd, buf, chunksize); - } - - if (ret < 0) { - test_print(rank, "%s failed", - use_pread ? 
"pread()" : "read()"); - return -1; - } - - if (lipsum) { - uint64_t epos = 0; - - ret = lipsum_check(buf, chunksize, offset, &epos); - if (ret < 0) { - test_print(rank, "lipsum check failed at offset %llu.", - (unsigned long long) epos); - return -1; - } - } - } - } - - gettimeofday(&read_end, NULL); - - return 0; -} - -static int do_listread(void) -{ - int ret = 0; - uint64_t i, j; - uint64_t nchunks = blocksize / chunksize; - uint64_t current_ix = 0; - struct aiocb* current = NULL; - - gettimeofday(&read_start, NULL); - - for (i = 0; i < nblocks; i++) { - for (j = 0; j < nchunks; j++) { - current_ix = i * nchunks + j; - - current = &aiocb_items[current_ix]; - aiocb_list[current_ix] = current; - - current->aio_fildes = fd; - current->aio_buf = &buf[current_ix * chunksize]; - current->aio_nbytes = chunksize; - current->aio_lio_opcode = LIO_READ; - - if (pattern == IO_PATTERN_N1) - current->aio_offset = i * total_ranks * blocksize - + rank * blocksize + j * chunksize; - else { - current->aio_offset = i * blocksize + j * chunksize; - } - } - } - - ret = lio_listio(LIO_WAIT, aiocb_list, n_aiocb_list, NULL); - if (ret < 0) { - test_print(rank, "lio_listio failed"); - return -1; - } - - if (lipsum) { - for (i = 0; i < nblocks * (blocksize / chunksize); i++) { - uint64_t epos = 0; - - current = &aiocb_items[i]; - - ret = lipsum_check((const char*) current->aio_buf, chunksize, - current->aio_offset, &epos); - if (ret < 0) { - test_print(rank, "lipsum check failed at offset %llu.", - (unsigned long long) epos); - return -1; - } - } - } - - gettimeofday(&read_end, NULL); - - return ret; -} - -static void report_result(void) -{ - double read_bw = .0F; - double agg_read_bw = .0F; - double max_read_time = .0F; - double min_read_bw = .0F; - double read_time = .0F; - - read_time = timediff_sec(&read_start, &read_end); - read_bw = 1.0 * blocksize * nblocks / read_time / (1 << 20); - - MPI_Reduce(&read_bw, &agg_read_bw, - 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); - MPI_Reduce(&read_time, &max_read_time, - 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - - min_read_bw = 1.0 * blocksize * nblocks * total_ranks - / max_read_time / (1 << 20); - - test_print_once(rank, - "\n" - "Number of processes: %d\n" - "Each process wrote: %lf MB\n" - "Total reads: %lf MB\n" - "I/O pattern: %s\n" - "I/O request size: %llu B\n" - "Aggregate read bandwidth: %lf MB/s\n" - "Min. 
read bandwidth: %lf MB/s\n" - "Total Read time: %lf sec.\n", - total_ranks, - 1.0 * blocksize * nblocks / (1 << 20), - 1.0 * total_ranks * blocksize * nblocks / (1 << 20), - io_pattern_string(pattern), - chunksize, - agg_read_bw, - min_read_bw, - max_read_time); -} - -static struct option const long_opts[] = { - { "blocksize", 1, 0, 'b' }, - { "nblocks", 1, 0, 'n' }, - { "chunksize", 1, 0, 'c' }, - { "debug", 0, 0, 'd' }, - { "filename", 1, 0, 'f' }, - { "help", 0, 0, 'h' }, - { "lipsum", 0, 0, 'L' }, - { "listio", 0, 0, 'l' }, - { "mount", 1, 0, 'm' }, - { "pattern", 1, 0, 'p' }, - { "pread", 0, 0, 'P' }, - { "standard", 0, 0, 's' }, - { "unmount", 0, 0, 'u' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "b:n:c:df:hLlm:Pp:su"; - -static const char* usage_str = - "\n" - "Usage: %s [options...]\n" - "\n" - "Available options:\n" - " -b, --blocksize= logical block size for the target file\n" - " (default 1048576, 1MB)\n" - " -n, --nblocks= count of blocks each process will read\n" - " (default 128)\n" - " -c, --chunksize= I/O chunk size for each read operation\n" - " (default 64436, 64KB)\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -f, --filename= target file name under mountpoint\n" - " (default: testfile)\n" - " -h, --help help message\n" - " -L, --lipsum check contents written by write test\n" - " -l, --listio use lio_listio(2) instead of read(2)\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -P, --pread use pread(2) instead of read(2)\n" - " -p, --pattern= should be 'n1'(n to 1) or 'nn' (n to n)\n" - " (default: n1)\n" - " -s, --standard do not use unifyfs but run standard I/O\n" - " -u, --unmount unmount the filesystem after test\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 2; - uint64_t bufsize = 0; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'b': - blocksize = strtoull(optarg, NULL, 0); - break; - - case 'n': - nblocks = strtoull(optarg, NULL, 0); - break; - - case 'c': - chunksize = strtoull(optarg, NULL, 0); - break; - - case 'f': - filename = strdup(optarg); - break; - - case 'd': - debug = 1; - break; - - case 'L': - lipsum = 1; - break; - - case 'l': - use_listio = 1; - break; - - case 'P': - use_pread = 1; - break; - - case 'p': - pattern = read_io_pattern(optarg); - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 's': - standard = 1; - break; - - case 'u': - unmount = 1; - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (pattern < 0) { - test_print_once(rank, "pattern should be 'n1' or 'nn'"); - exit(-1); - } - - if (blocksize < chunksize || blocksize % chunksize > 0) { - test_print_once(rank, "blocksize should be larger than " - "and divisible by chunksize."); - exit(-1); - } - - if (chunksize % (1 << 10) > 0) { - test_print_once(rank, "chunksize and blocksize should be divisible " - "by 1024."); - exit(-1); - } - - if (static_linked(program) && standard) { - test_print_once(rank, "--standard, -s option only works when " - "dynamically linked."); - exit(-1); - } - - if (use_listio && use_pread) { - test_print_once(rank, - "--listio and --pread should be set exclusively"); - 
exit(-1); - } - - if (use_listio) { - bufsize = blocksize * nblocks; - n_aiocb_list = blocksize * nblocks / chunksize; - } else { - bufsize = chunksize; - } - - sprintf(targetfile, "%s/%s", mountpoint, filename); - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - } - - buf = calloc(1, bufsize); - if (!buf) { - test_print(rank, "calloc failed"); - exit(-1); - } - - if (use_listio) { - aiocb_list = calloc(n_aiocb_list, sizeof(*aiocb_list)); - aiocb_items = calloc(n_aiocb_list, sizeof(*aiocb_items)); - - if (!aiocb_list || !aiocb_items) { - test_print(rank, "calloc failed"); - exit(-1); - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (pattern == IO_PATTERN_NN) { - sprintf(&targetfile[strlen(targetfile)], "-%d", rank); - } - - fd = open(targetfile, O_RDONLY, 0600); - if (fd < 0) { - test_print(rank, "open failed"); - exit(-1); - } - - ret = use_listio ? do_listread() : do_read(); - - close(fd); - - fflush(stdout); - - MPI_Barrier(MPI_COMM_WORLD); - - if (!standard && unmount) { - unifyfs_unmount(); - } - - if (ret == 0) { - report_result(); - } - - free(buf); - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-stat.c b/examples/src/sysio-stat.c deleted file mode 100644 index 8251274da..000000000 --- a/examples/src/sysio-stat.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int rank; -static int total_ranks; -static int debug; - -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "/unifyfs"; -static int unmount; /* unmount unifyfs after running the test */ -static int testrank = -1; /* if negative, execute from all ranks */ - -static void do_stat(int rank) -{ - int ret = 0; - struct stat sb; - - ret = stat(filename, &sb); - if (ret < 0) { - test_print(rank, "stat failed on \"%s\" (%d:%s)", - filename, errno, strerror(errno)); - } else { - dump_stat(rank, &sb, filename); - } -} - -static struct option const long_opts[] = { - { "debug", 0, 0, 'd' }, - { "help", 0, 0, 'h' }, - { "mount", 1, 0, 'm' }, - { "unmount", 0, 0, 'u' }, - { "rank", 1, 0, 'r' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "dhm:ur:"; - -static const char* usage_str = - "\n" - "Usage: %s [options...] 
\n" - "\n" - "Available options:\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -h, --help help message\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -u, --unmount unmount the filesystem after test\n" - " -r, --rank= only test on rank \n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 0; - struct stat sb = { 0, }; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'd': - debug = 1; - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'u': - unmount = 1; - break; - - case 'r': - testrank = atoi(optarg); - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (argc - optind != 1) { - print_usage(); - } - - if (testrank > total_ranks - 1) { - test_print(0, "Please specify a valid rank number."); - print_usage(); - } - - filename = argv[optind]; - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (testrank < 0) { /* execute from all ranks in order */ - int i = 0; - - for (i = 0; i < total_ranks; i++) { - if (rank == i) { - do_stat(rank); - } - - MPI_Barrier(MPI_COMM_WORLD); - } - - } else { - if (rank == testrank) { - do_stat(rank); - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (unmount) { - unifyfs_unmount(); - } - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-truncate.c b/examples/src/sysio-truncate.c deleted file mode 100644 index b757c5072..000000000 --- a/examples/src/sysio-truncate.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int rank; -static int total_ranks; -static int debug; - -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "/unifyfs"; -static int unmount; /* unmount unifyfs after running the test */ -static int testrank; -static off_t targetlen; - -static struct option long_opts[] = { - { "debug", 0, 0, 'd' }, - { "help", 0, 0, 'h' }, - { "length", 1, 0, 'l' }, - { "mount", 1, 0, 'm' }, - { "unmount", 0, 0, 'u' }, - { "rank", 1, 0, 'r' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "dhl:m:ur:"; - -static const char* usage_str = - "\n" - "Usage: %s [options...] 
\n" - "\n" - "Available options:\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -h, --help help message\n" - " -l, --length= truncate the file to \n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -u, --unmount unmount the filesystem after test\n" - " -r, --rank= test on rank (default: 0)\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 0; - struct stat sb = { 0, }; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'd': - debug = 1; - break; - - case 'l': - targetlen = strtoull(optarg, NULL, 0); - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'u': - unmount = 1; - break; - - case 'r': - testrank = atoi(optarg); - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (argc - optind != 1) { - print_usage(); - } - - if (testrank > total_ranks - 1) { - test_print(0, "Please specify a valid rank number."); - print_usage(); - } - - filename = argv[optind]; - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == testrank) { - /* try stat the file before truncate */ - ret = stat(filename, &sb); - if (ret < 0) { - test_print(rank, "stat failed on \"%s\"", filename); - } else { - test_print(rank, "## stat before truncate to %llu\n", - (unsigned long long) targetlen); - dump_stat(rank, &sb, filename); - } - - ret = truncate(filename, targetlen); - if (ret < 0) { - test_print(rank, "truncate failed on \"%s\": (errno=%d: %s)", - filename, errno, strerror(errno)); - } - - /* try stat the file again after truncate */ - ret = stat(filename, &sb); - if (ret < 0) { - test_print(rank, "stat failed on \"%s\"", filename); - } else { - test_print(rank, "## stat after truncate to %llu\n", - (unsigned long long) targetlen); - dump_stat(rank, &sb, filename); - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (unmount) { - unifyfs_unmount(); - } - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-unlink.c b/examples/src/sysio-unlink.c deleted file mode 100644 index 4de9a7d4d..000000000 --- a/examples/src/sysio-unlink.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -static int rank; -static int total_ranks; -static int debug; - -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "/unifyfs"; -static int unmount; /* unmount unifyfs after running the test */ -static int testrank; - -static struct option long_opts[] = { - { "debug", 0, 0, 'd' }, - { "help", 0, 0, 'h' }, - { "mount", 1, 0, 'm' }, - { "unmount", 0, 0, 'u' }, - { "rank", 1, 0, 'r' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "dhm:ur:"; - -static const char* usage_str = - "\n" - "Usage: %s [options...] \n" - "\n" - "Available options:\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -h, --help help message\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -u, --unmount unmount the filesystem after test\n" - " -r, --rank= test on rank (default: 0)\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 0; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'd': - debug = 1; - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'u': - unmount = 1; - break; - - case 'r': - testrank = atoi(optarg); - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (argc - optind != 1) { - print_usage(); - } - - if (testrank > total_ranks - 1) { - test_print(0, "Please specify a valid rank number."); - print_usage(); - } - - filename = argv[optind]; - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - exit(-1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == testrank) { - ret = unlink(filename); - if (ret < 0) { - test_print(rank, "unlink failed on \"%s\" (%s)", - filename, strerror(errno)); - } - } - -out: - MPI_Barrier(MPI_COMM_WORLD); - - if (unmount) { - unifyfs_unmount(); - } - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-write.c b/examples/src/sysio-write.c deleted file mode 100644 index 6ec679dd0..000000000 --- a/examples/src/sysio-write.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "testlib.h" - -/* - * I/O test: - * - * Each process will write @blocksize*@nblocks. @chunksize denotes I/O request - * size for each write(2) call. Each block is split into multiple chunks, - * meaning that @blocksize should be larger than and multiple of @chunksize. 
- */ - -static uint64_t blocksize = 1 << 20; /* 1MB */ -static uint64_t nblocks = 128; /* Each process writes 128MB */ -static uint64_t chunksize = 64 * (1 << 10); /* 64KB for each write(2) call */ - -static int use_pwrite; /* use pwrite(2) */ -static int pattern; /* N to 1 (N1, default) or N to N (NN) */ -static int fd; /* target file descriptor */ -static int synchronous; /* sync metadata for each write? (default: no)*/ - -static int lipsum; /* generate contents to verify correctness */ -static int standard; /* not mounting unifyfs when set */ - -/* time statistics */ -static struct timeval write_start, meta_start, write_end; - -static int rank; -static int total_ranks; - -static int debug; /* pause for attaching debugger */ -static char* buf; /* I/O buffer */ -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static char* filename = "testfile"; /* testfile name under mountpoint */ -static char targetfile[NAME_MAX]; /* target file name */ - -static int do_write(void) -{ - int ret = 0; - uint64_t i, j, offset; - uint64_t nchunks = blocksize / chunksize; - - gettimeofday(&write_start, NULL); - - for (i = 0; i < nblocks; i++) { - for (j = 0; j < nchunks; j++) { - if (pattern == IO_PATTERN_N1) - offset = i * total_ranks * blocksize + rank * blocksize - + j * chunksize; - else { - offset = i * blocksize + j * chunksize; - } - - if (lipsum) { - lipsum_generate(buf, chunksize, offset); - } - - if (use_pwrite) { - ret = pwrite(fd, buf, chunksize, offset); - } else { - lseek(fd, offset, SEEK_SET); - ret = write(fd, buf, chunksize); - } - - if (ret < 0) { - test_print(rank, "%s failed", - use_pwrite ? "pwrite()" : "write()"); - return -1; - } - - if (synchronous) { - fsync(fd); - } - } - } - - gettimeofday(&meta_start, NULL); - - fsync(fd); - - gettimeofday(&write_end, NULL); - - return 0; -} - -static void report_result(void) -{ - double write_bw = .0F; - double agg_write_bw = .0F; - double max_write_time = .0F; - double min_write_bw = .0F; - double write_time = .0F; - double meta_time = .0F; - double max_meta_time = .0F; - - write_time = timediff_sec(&write_start, &write_end); - write_bw = 1.0 * blocksize * nblocks / write_time / (1 << 20); - - meta_time = timediff_sec(&meta_start, &write_end); - - MPI_Reduce(&write_bw, &agg_write_bw, - 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); - MPI_Reduce(&write_time, &max_write_time, - 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - MPI_Reduce(&meta_time, &max_meta_time, - 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - - min_write_bw = 1.0 * blocksize * nblocks * total_ranks - / max_write_time / (1 << 20); - - test_print_once(rank, - "\n" - "Number of processes: %d\n" - "Each process wrote: %lf MB\n" - "Total writes: %lf MB\n" - "I/O pattern: %s\n" - "I/O request size: %llu B\n" - "Aggregate write bandwidth: %lf MB/s\n" - "Min. write bandwidth: %lf MB/s\n" - "Total Write time: %lf sec. 
(%lf for fsync)\n", - total_ranks, - 1.0 * blocksize * nblocks / (1 << 20), - 1.0 * total_ranks * blocksize * nblocks / (1 << 20), - io_pattern_string(pattern), - chunksize, - agg_write_bw, - min_write_bw, - max_write_time, - max_meta_time); -} - -static struct option const long_opts[] = { - { "blocksize", 1, 0, 'b' }, - { "nblocks", 1, 0, 'n' }, - { "chunksize", 1, 0, 'c' }, - { "debug", 0, 0, 'd' }, - { "filename", 1, 0, 'f' }, - { "help", 0, 0, 'h' }, - { "lipsum", 0, 0, 'L' }, - { "mount", 1, 0, 'm' }, - { "pattern", 1, 0, 'p' }, - { "pwrite", 0, 0, 'P' }, - { "synchronous", 0, 0, 'S' }, - { "standard", 0, 0, 's' }, - { 0, 0, 0, 0}, -}; - -static char* short_opts = "b:n:c:df:hlm:p:PSs"; - -static const char* usage_str = - "\n" - "Usage: %s [options...]\n" - "\n" - "Available options:\n" - " -b, --blocksize= logical block size for the target file\n" - " (default 1048576, 1MB)\n" - " -n, --nblocks= count of blocks each process will write\n" - " (default 128)\n" - " -c, --chunksize= I/O chunk size for each write operation\n" - " (default 64436, 64KB)\n" - " -d, --debug pause before running test\n" - " (handy for attaching in debugger)\n" - " -f, --filename= target file name under mountpoint\n" - " (default: testfile)\n" - " -h, --help help message\n" - " -L, --lipsum generate contents to verify correctness\n" - " -m, --mount= use for unifyfs\n" - " (default: /unifyfs)\n" - " -P, --pwrite use pwrite(2) instead of write(2)\n" - " -p, --pattern= should be 'n1'(n to 1) or 'nn' (n to n)\n" - " (default: n1)\n" - " -S, --synchronous sync metadata on each write\n" - " -s, --standard do not use unifyfs but run standard I/O\n" - "\n"; - -static char* program; - -static void print_usage(void) -{ - test_print_once(rank, usage_str, program); - exit(0); -} - -int main(int argc, char** argv) -{ - int ret = 0; - int ch = 0; - int optidx = 2; - - program = basename(strdup(argv[0])); - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((ch = getopt_long(argc, argv, - short_opts, long_opts, &optidx)) >= 0) { - switch (ch) { - case 'b': - blocksize = strtoull(optarg, NULL, 0); - break; - - case 'n': - nblocks = strtoull(optarg, NULL, 0); - break; - - case 'c': - chunksize = strtoull(optarg, NULL, 0); - break; - - case 'f': - filename = strdup(optarg); - break; - - case 'd': - debug = 1; - break; - - case 'P': - use_pwrite = 1; - break; - - case 'p': - pattern = read_io_pattern(optarg); - break; - - case 'L': - lipsum = 1; - break; - - case 'm': - mountpoint = strdup(optarg); - break; - - case 'S': - synchronous = 1; - break; - - case 's': - standard = 1; - break; - - case 'h': - default: - print_usage(); - break; - } - } - - if (pattern < 0) { - test_print_once(rank, "pattern should be 'n1' or 'nn'"); - exit(-1); - } - - if (blocksize < chunksize || blocksize % chunksize > 0) { - test_print_once(rank, "blocksize should be larger than " - "and divisible by chunksize."); - exit(-1); - } - - if (chunksize % (1 << 10) > 0) { - test_print_once(rank, "chunksize and blocksize should be divisible " - "by 1024."); - exit(-1); - } - - if (static_linked(program) && standard) { - test_print_once(rank, "--standard, -s option only works when " - "dynamically linked."); - exit(-1); - } - - sprintf(targetfile, "%s/%s", mountpoint, filename); - - if (debug) { - test_pause(rank, "Attempting to mount"); - } - - if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", 
ret); - exit(-1); - } - } - - buf = calloc(1, chunksize); - if (!buf) { - test_print(rank, "calloc failed"); - exit(-1); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (pattern == IO_PATTERN_NN) { - sprintf(&targetfile[strlen(targetfile)], "-%d", rank); - } - - if (rank == 0) { - fd = open(targetfile, O_RDWR | O_CREAT | O_TRUNC, 0600); - if (fd < 0) { - test_print(rank, "open failed"); - exit(-1); - } - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank != 0) { - fd = open(targetfile, O_RDWR, 0600); - if (fd < 0) { - test_print(rank, "open failed"); - exit(-1); - } - } - - ret = do_write(); - - close(fd); - - fflush(stdout); - - MPI_Barrier(MPI_COMM_WORLD); - - /* have rank 0 check the expected file size matches actual file size */ - if (rank == 0) { - /* compute expected size of file after all procs have written, - * each process writes nblocks in groups of nchunks each of - * which is chunksize bytes */ - uint64_t nchunks = blocksize / chunksize; - off_t expected_size = (off_t)nblocks * (off_t)nchunks * - (off_t)chunksize * (off_t)total_ranks; - - /* get stat data for the file */ - errno = 0; - struct stat sbuf; - int stat_rc = stat(targetfile, &sbuf); - if (stat_rc == 0) { - /* check that stat size matches expected size */ - if (sbuf.st_size != expected_size) { - test_print(rank, "%s size incorrect got %llu, expected %llu", - targetfile, (unsigned long long)sbuf.st_size, - (unsigned long long)expected_size); - ret = 1; - } - } else { - /* our call to stat failed */ - test_print(rank, "stat(%s) failed:", - targetfile); - ret = 1; - } - } - - if (!standard) { - unifyfs_unmount(); - } - - if (ret == 0) { - report_result(); - } - - free(buf); - - MPI_Finalize(); - - return ret; -} - diff --git a/examples/src/sysio-writeread.c b/examples/src/sysio-writeread.c deleted file mode 100644 index 9285b0f02..000000000 --- a/examples/src/sysio-writeread.c +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -/* - * Copyright (c) 2013, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * code Written by - * Raghunath Rajachandrasekar - * Kathryn Mohror - * Adam Moody - * All rights reserved. - * This file is part of CRUISE. 
- * For details, see https://github.com/hpc/cruise - * Please also read this file LICENSE.CRUISE - */ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef DISABLE_UNIFYFS -# include -#endif - -#define TEST_STR_LEN 1024 - -struct timeval write_start, write_end; -double write_time; - -struct timeval meta_start; -double meta_time; - -struct timeval read_start, read_end; -double read_time; - -typedef struct { - int fid; - long offset; - long length; - char* buf; -} read_req_t; - -int main(int argc, char* argv[]) -{ - static const char* opts = "b:f:m:n:p:t:u:"; - char tmpfname[TEST_STR_LEN], fname[TEST_STR_LEN], mntpt[TEST_STR_LEN]; - size_t blk_sz = 0, num_blk = 0, tran_sz = 0, num_reqs = 0; - size_t index, i, j, offset = 0; - ssize_t rc; - int ret; - int pat = 0, c, num_rank, rank, fd, use_unifyfs = 0; - - MPI_Init(&argc, &argv); - MPI_Comm_size(MPI_COMM_WORLD, &num_rank); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - while ((c = getopt(argc, argv, opts)) != -1) { - switch (c) { - case 'b': /*size of block*/ - blk_sz = atol(optarg); - break; - case 'f': - strcpy(fname, optarg); - break; - case 'm': - strcpy(mntpt, optarg); - break; - case 'n': /*number of blocks each process writes*/ - num_blk = atol(optarg); - break; - case 'p': - pat = atoi(optarg); - break; /* 0: N-1 segment/strided, 1: N-N*/ - case 't': /*size of each write */ - tran_sz = atol(optarg); - break; - case 'u': /* use unifyfs */ - use_unifyfs = atoi(optarg); - break; - } - } - - if (use_unifyfs) { - strcpy(mntpt, "/unifyfs"); - } else { - strcpy(mntpt, "/tmp"); - } - - if ((pat < 0) || (pat > 1)) { - printf("unsupported I/O pattern"); - fflush(stdout); - return -1; - } - - if (blk_sz == 0) { - blk_sz = 1048576; /* 1 MiB block size */ - } - - if (num_blk == 0) { - num_blk = 64; /* 64 blocks per process */ - } - - if (tran_sz == 0) { - tran_sz = 32768; /* 32 KiB IO operation size */ - } - - size_t n_tran_per_blk = blk_sz / tran_sz; - double rank_mib = (double)(blk_sz * num_blk) / 1048576; - double total_mib = rank_mib * num_rank; - - char* buf = malloc(tran_sz); - if (buf == NULL) { - return -1; - } - - int byte = (int)'0' + rank; - memset(buf, byte, tran_sz); - -#ifndef DISABLE_UNIFYFS - if (use_unifyfs) { - ret = unifyfs_mount(mntpt, rank, num_rank, 0); - if (0 != ret) { - MPI_Abort(MPI_COMM_WORLD, ret); - } - MPI_Barrier(MPI_COMM_WORLD); - } -#endif - - if (pat == 0) { // N-1 - sprintf(tmpfname, "%s/%s", mntpt, fname); - } else { // N-N - sprintf(tmpfname, "%s/%s%d", mntpt, fname, rank); - } - - int open_flags = O_CREAT | O_RDWR; - fd = open(tmpfname, open_flags, 0644); - if (fd < 0) { - printf("open file failure\n"); - fflush(stdout); - return -1; - } - - MPI_Barrier(MPI_COMM_WORLD); - gettimeofday(&write_start, NULL); - - for (i = 0; i < num_blk; i++) { - for (j = 0; j < n_tran_per_blk; j++) { - if (pat == 0) - offset = (i * blk_sz * num_rank) - + (rank * blk_sz) + (j * tran_sz); - else { - offset = (i * blk_sz) + (j * tran_sz); - } - rc = pwrite(fd, buf, tran_sz, offset); - - if (rc < 0) { - perror("pwrite failed"); - } - } - } - - gettimeofday(&meta_start, NULL); - fsync(fd); - - gettimeofday(&write_end, NULL); - free(buf); - - meta_time += 1000000 * (write_end.tv_sec - meta_start.tv_sec) - + write_end.tv_usec - meta_start.tv_usec; - meta_time /= 1000000; - - write_time += 1000000 * (write_end.tv_sec - write_start.tv_sec) - + write_end.tv_usec - write_start.tv_usec; - write_time = write_time / 1000000; - - double agg_write_bw, max_write_time; - double write_bw = rank_mib / 
write_time; - MPI_Reduce(&write_bw, &agg_write_bw, 1, MPI_DOUBLE, MPI_SUM, - 0, MPI_COMM_WORLD); - MPI_Reduce(&write_time, &max_write_time, 1, MPI_DOUBLE, MPI_MAX, - 0, MPI_COMM_WORLD); - double min_write_bw = total_mib / max_write_time; - - if (rank == 0) { - printf("Aggregate Write BW is %.3lf MiB/s\n" - " Minimum Write BW is %.3lf MiB/s\n\n", - agg_write_bw, min_write_bw); - fflush(stdout); - } - - /* read buffer */ - char* read_buf = calloc(num_blk, blk_sz); - - /* list of read requests for lio_listio */ - num_reqs = num_blk * n_tran_per_blk; - - struct aiocb* aiocb_list = (struct aiocb*) calloc(num_reqs, - sizeof(struct aiocb)); - - struct aiocb** cb_list = (struct aiocb**) calloc(num_reqs, - sizeof(struct aiocb*)); - - if ((read_buf == NULL) || (aiocb_list == NULL) || (cb_list == NULL)) { - return -1; - } - - index = 0; - - if (pat == 0) { // N-1 - for (i = 0; i < num_blk; i++) { - for (j = 0; j < n_tran_per_blk; j++) { - aiocb_list[index].aio_fildes = fd; - aiocb_list[index].aio_buf = read_buf + (index * tran_sz); - aiocb_list[index].aio_nbytes = tran_sz; - aiocb_list[index].aio_offset = (i * blk_sz * num_rank) - + (rank * blk_sz) - + (j * tran_sz); - aiocb_list[index].aio_lio_opcode = LIO_READ; - cb_list[index] = &aiocb_list[index]; - index++; - } - } - } else { // N-N - for (i = 0; i < num_blk; i++) { - for (j = 0; j < n_tran_per_blk; j++) { - aiocb_list[index].aio_fildes = fd; - aiocb_list[index].aio_buf = read_buf + (index * tran_sz); - aiocb_list[index].aio_nbytes = tran_sz; - aiocb_list[index].aio_offset = (i * blk_sz) + (j * tran_sz); - aiocb_list[index].aio_lio_opcode = LIO_READ; - cb_list[index] = &aiocb_list[index]; - index++; - } - } - } - - MPI_Barrier(MPI_COMM_WORLD); - gettimeofday(&read_start, NULL); - - ret = lio_listio(LIO_WAIT, cb_list, num_reqs, NULL); - if (ret < 0) { - perror("lio_listio failed"); - } - - gettimeofday(&read_end, NULL); - - close(fd); - free(read_buf); - - read_time = (read_end.tv_sec - read_start.tv_sec) * 1000000 - + read_end.tv_usec - read_start.tv_usec; - read_time = read_time / 1000000; - - double agg_read_bw, max_read_time; - double read_bw = rank_mib / read_time; - MPI_Reduce(&read_bw, &agg_read_bw, 1, MPI_DOUBLE, MPI_SUM, - 0, MPI_COMM_WORLD); - MPI_Reduce(&read_time, &max_read_time, 1, MPI_DOUBLE, MPI_MAX, - 0, MPI_COMM_WORLD); - double min_read_bw = total_mib / max_read_time; - - if (rank == 0) { - printf("Aggregate Read BW is %.3lf MiB/s\n" - " Minimum Read BW is %.3lf MiB/s\n\n", - agg_read_bw, min_read_bw); - fflush(stdout); - } - -#ifndef DISABLE_UNIFYFS - if (use_unifyfs) { - unifyfs_unmount(); - } -#endif - - MPI_Finalize(); - - return 0; -} diff --git a/examples/src/sysio-writeread2.c b/examples/src/sysio-writeread2.c deleted file mode 100644 index 76cad03a5..000000000 --- a/examples/src/sysio-writeread2.c +++ /dev/null @@ -1,414 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
- */ - -// build: mpigcc -g -O3 -o test_ramdisk test_ramdisk.c -// run: srun -n64 -N4 ./test_ramdisk - -#include - -#define _GNU_SOURCE 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//size_t filesize = 100*1024*1024; -size_t filesize = 1024*1024; -int times = 5; -int seconds; -int rank = -1; -int ranks; - -/* reliable read from file descriptor - * (retries, if necessary, until hard error) - */ -int reliable_read(int fd, void *buf, size_t size) -{ - size_t n = 0; - int retries = 10; - int rank; - char host[128]; - - while (n < size) { - int rc = read(fd, (char *) buf + n, size - n); - - if (rc > 0) - n += rc; - else if (rc == 0) - return n; /* EOF */ - else { /* (rc < 0) */ - /* got an error, check whether it was serious */ - if (errno == EINTR || errno == EAGAIN) - continue; - - /* something worth printing an error about */ - retries--; - - if (retries) { - /* print an error and try again */ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - gethostname(host, sizeof(host)); - printf("%d on %s: ERROR: Error reading: " - "read(%d, %p, %ld) errno=%d %s @ %s:%d\n", - rank, host, fd, (char *) buf + n, size - n, - errno, strerror(errno), __FILE__, __LINE__); - } else { - /* too many failed retries, give up */ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - gethostname(host, sizeof(host)); - printf("%d on %s: ERROR: Giving up read: " - "read(%d, %p, %ld) errno=%d %s @ %s:%d\n", - rank, host, fd, (char *) buf + n, size - n, - errno, strerror(errno), __FILE__, __LINE__); - MPI_Abort(MPI_COMM_WORLD, 0); - } - } - } - return size; -} - -/* reliable write to file descriptor (retries, if necessary, until hard error) - */ -int reliable_write(int fd, const void *buf, size_t size) -{ - size_t n = 0; - int retries = 10; - int rank; - char host[128]; - - while (n < size) { - int rc = write(fd, (char *) buf + n, size - n); - - if (rc > 0) - n += rc; - else if (rc == 0) { - /* something bad happened, print an error and abort */ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - gethostname(host, sizeof(host)); - printf("%d on %s: ERROR: Error writing: " - "write(%d, %p, %ld) returned 0 @ %s:%d\n", - rank, host, fd, (char *) buf + n, size - n, - __FILE__, __LINE__); - MPI_Abort(MPI_COMM_WORLD, 0); - } else { /* (rc < 0) */ - /* got an error, check whether it was serious */ - if (errno == EINTR || errno == EAGAIN) - continue; - - /* something worth printing an error about */ - retries--; - - if (retries) { - /* print an error and try again */ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - gethostname(host, sizeof(host)); - printf("%d on %s: ERROR: Error writing: " - "write(%d, %p, %ld) errno=%d %s @ %s:%d\n", - rank, host, fd, (char *) buf + n, size - n, - errno, strerror(errno), __FILE__, __LINE__); - } else { - /* too many failed retries, give up */ - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - gethostname(host, sizeof(host)); - printf("%d on %s: ERROR: Giving up write: " - "write(%d, %p, %ld) errno=%d %s @ %s:%d\n", - rank, host, fd, (char *) buf + n, size - n, - errno, strerror(errno), __FILE__, __LINE__); - MPI_Abort(MPI_COMM_WORLD, 0); - } - } - } - return size; -} - -/* initialize buffer with some well-known value based on rank */ -int init_buffer(char *buf, size_t size, int rank, int ckpt) -{ - size_t i; - - for (i = 0; i < size; i++) { - char c = 'a' + (char)((rank + ckpt + i) & 32); - - buf[i] = c; - } - - return 0; -} - -/* checks buffer for expected value */ -int check_buffer(char *buf, size_t size, int rank, int ckpt) -{ - size_t i; - - for (i = 0; i < size; i++) { - 
char c = 'a' + (char)((rank + ckpt + i) & 32);
-
-        if (buf[i] != c) {
-            printf("check failed at byte %d, should be %c is %c\n",
-                   (int) i, c, buf[i]);
-            return 0;
-        }
-    }
-
-    return 1;
-}
-
-/* read the checkpoint data from file into buf, and return whether the read was
- * successful
- */
-int read_checkpoint(int fd, int *rank, int *ckpt, char *buf, size_t size)
-{
-    int ret;
-    unsigned long n;
-    char rank_buf[7];
-    char ckpt_buf[7];
-    size_t field_size = 6;
-
-    /* read the rank id */
-    n = reliable_read(fd, rank_buf, field_size);
-    if (n != field_size) {
-        printf("Failed to read rank\n");
-        return 0;
-    }
-    rank_buf[6] = '\0';
-
-    /* read the checkpoint id */
-    n = reliable_read(fd, ckpt_buf, field_size);
-    if (n != field_size) {
-        printf("Failed to read timestep\n");
-        return 0;
-    }
-    ckpt_buf[6] = '\0';
-
-    /* read the checkpoint data, and check the file size */
-    n = reliable_read(fd, buf, size+1);
-    if (n != size) {
-        printf("Filesize not correct\n");
-        return 0;
-    }
-
-    /* if the file looks good, set the timestep and return */
-    ret = sscanf(rank_buf, "%6d", rank);
-    ret = sscanf(ckpt_buf, "%6d", ckpt);
-
-    if (ret == EOF)
-        perror("sscanf");
-
-    return 0;
-}
-
-/* write the checkpoint data to fd, and return whether the write was successful
- */
-int write_checkpoint(int fd, int rank, int ckpt, char *buf, size_t size)
-{
-    int rc;
-    int valid = 0;
-    char rank_buf[7];
-    char ckpt_buf[7];
-    size_t field_size = 6;
-
-    /* write the rank id */
-    sprintf(rank_buf, "%06d", rank);
-    rc = reliable_write(fd, rank_buf, field_size);
-    if (rc < 0)
-        valid = 0;
-
-    /* write the checkpoint id (application timestep) */
-    sprintf(ckpt_buf, "%06d", ckpt);
-    rc = reliable_write(fd, ckpt_buf, field_size);
-    if (rc < 0)
-        valid = 0;
-
-    /* write the checkpoint data */
-    rc = reliable_write(fd, buf, size);
-    if (rc < 0)
-        valid = 0;
-
-    return valid;
-}
-
-void checkdata(char *file, size_t size, int times)
-{
-    char *buf = malloc(size);
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    if (times > 0) {
-        /* write the checkpoint file */
-        int i;
-
-        for (i = 0; i < times; i++) {
-            int rc;
-            int valid = 0;
-
-            rc = init_buffer(buf, size, rank, i);
-
-            if (rank == 0) {
-                printf("Writing checkpoint %d.\n", i);
-                fflush(stdout);
-            }
-
-            /* open the file and write the checkpoint */
-            int fd_me = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0600);
-
-            if (fd_me > 0) {
-                valid = 1;
-
-                /* write the checkpoint data */
-                rc = write_checkpoint(fd_me, rank, i, buf, size);
-                if (rc < 0)
-                    valid = 0;
-
-                /* force the data to storage */
-                rc = fsync(fd_me);
-                if (rc < 0)
-                    valid = 0;
-
-                /* make sure the close is without error */
-                rc = close(fd_me);
-                if (rc < 0)
-                    valid = 0;
-            }
-
-            if (!valid) {
-                printf("failed to write checkpoint\n");
-                continue;
-            }
-
-            if (rank == 0) {
-                printf("Completed checkpoint %d.\n", i);
-                fflush(stdout);
-            }
-
-            if (rank == 0) {
-                printf("Reading checkpoint %d.\n", i);
-                fflush(stdout);
-            }
-
-            memset(buf, 0, size);
-
-            /* open the file and write the checkpoint */
-            int read_rank, read_timestep;
-
-            fd_me = open(file, O_RDONLY);
-            if (fd_me > 0) {
-                valid = 1;
-
-                /* write the checkpoint data */
-                rc = read_checkpoint(fd_me, &read_rank, &read_timestep, buf,
-                                     size);
-                if (rc < 0)
-                    valid = 0;
-
-                /* make sure the close is without error */
-                rc = close(fd_me);
-                if (rc < 0)
-                    valid = 0;
-            }
-
-            if (!valid) {
-                printf("failed to read checkpoint");
-                continue;
-            }
-
-            if (read_rank != rank || read_timestep != i) {
-                printf("INVALID HEADER on rank %d in step %d\n", rank, i);
-                fflush(stdout);
-
-                MPI_Abort(MPI_COMM_WORLD, 0);
-            }
-
-            rc = check_buffer(buf, size, rank, i);
-            if (!rc) {
-                printf("INVALID DATA on rank %d in step %d\n", rank, i);
-                fflush(stdout);
-
-                MPI_Abort(MPI_COMM_WORLD, 0);
-            }
-
-            if (rank == 0) {
-                printf("Verified checkpoint %d.\n", read_timestep);
-                fflush(stdout);
-            }
-
-            /* optionally sleep for some time */
-            if (seconds > 0) {
-                if (rank == 0) {
-                    printf("Sleeping for %d seconds...\n", seconds);
-                    fflush(stdout);
-                }
-                sleep(seconds);
-            }
-
-            unlink(file);
-        }
-    }
-
-    MPI_Barrier(MPI_COMM_WORLD);
-
-    if (buf != NULL) {
-        free(buf);
-        buf = NULL;
-    }
-
-    return;
-}
-
-int main(int argc, char *argv[])
-{
-    /* check that we got an appropriate number of arguments */
-    if (argc != 1 && argc != 4) {
-        printf("Usage: test_correctness [filesize times sleep_secs]\n");
-        return 1;
-    }
-
-    /* read parameters from command line, if any */
-    if (argc > 1) {
-        filesize = (size_t) atol(argv[1]);
-        times = atoi(argv[2]);
-        seconds = atoi(argv[3]);
-    }
-
-    MPI_Init(&argc, &argv);
-
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &ranks);
-
-    unifyfs_mount("/unifyfs", rank, ranks, 0);
-
-    char name[256];
-
-    sprintf(name, "/unifyfs/rank.%d", rank);
-
-    /* allocate space for the checkpoint data (make filesize a function of rank
-     * for some variation)
-     */
-    filesize = filesize + rank;
-
-    /* verify data integrity in file */
-    checkdata(name, filesize, times);
-
-    unifyfs_unmount();
-
-    MPI_Finalize();
-
-    return 0;
-}
diff --git a/m4/gotcha.m4 b/m4/gotcha.m4
index 54e09a5db..4baf3dd2b 100644
--- a/m4/gotcha.m4
+++ b/m4/gotcha.m4
@@ -5,20 +5,30 @@ AC_DEFUN([UNIFYFS_AC_GOTCHA], [
   GOTCHA_OLD_LDFLAGS=$LDFLAGS

   AC_ARG_WITH([gotcha], [AC_HELP_STRING([--with-gotcha=PATH],
-    [path to installed libgotcha [default=/usr/local]])], [
-    GOTCHA_CFLAGS="-I${withval}/include"
-    GOTCHA_LDFLAGS="-L${withval}/lib64 -L${withval}/lib"
-    CFLAGS="$CFLAGS ${GOTCHA_CFLAGS}"
-    CXXFLAGS="$CXXFLAGS ${GOTCHA_CFLAGS}"
-    LDFLAGS="$LDFLAGS ${GOTCHA_LDFLAGS}"
-  ], [])
+    [path to installed libgotcha [default=/usr/local]])],
+    [
+      GOTCHA_DIR="${withval}"
+      GOTCHA_CFLAGS="-I${GOTCHA_DIR}/include"
+      GOTCHA_LDFLAGS="-L${GOTCHA_DIR}/lib64 -L${GOTCHA_DIR}/lib -Wl,-rpath,${GOTCHA_DIR}/lib64 -Wl,-rpath,${GOTCHA_DIR}/lib"
+      CFLAGS="$CFLAGS ${GOTCHA_CFLAGS}"
+      CXXFLAGS="$CXXFLAGS ${GOTCHA_CFLAGS}"
+      LDFLAGS="$LDFLAGS ${GOTCHA_LDFLAGS}"
+    ],
+    [
+      GOTCHA_CFLAGS=""
+      GOTCHA_LDFLAGS=""
+    ]
+  )

   AC_CHECK_LIB([gotcha], [gotcha_wrap],
     [
+      GOTCHA_LIBS="${GOTCHA_LDFLAGS} -lgotcha"
       AC_SUBST(GOTCHA_CFLAGS)
       AC_SUBST(GOTCHA_LDFLAGS)
+      AC_SUBST(GOTCHA_LIBS)
       AM_CONDITIONAL([HAVE_GOTCHA], [true])
-    ],[
+    ],
+    [
       AC_MSG_WARN([couldn't find a suitable libgotcha, use --with-gotcha=PATH])
       AM_CONDITIONAL([HAVE_GOTCHA], [false])
     ],
diff --git a/m4/spath.m4 b/m4/spath.m4
index 9d0abd882..5a4070502 100644
--- a/m4/spath.m4
+++ b/m4/spath.m4
@@ -1,25 +1,43 @@
 AC_DEFUN([UNIFYFS_AC_SPATH], [
+  # preserve state of flags
+  SPATH_OLD_CFLAGS=$CFLAGS
+  SPATH_OLD_CXXFLAGS=$CXXFLAGS
+  SPATH_OLD_LDFLAGS=$LDFLAGS

-  AC_ARG_WITH([spath], [AS_HELP_STRING([--with-spath=PATH],
+  AC_ARG_WITH([spath], [AC_HELP_STRING([--with-spath=PATH],
     [path to installed libspath [default=/usr/local]])],
-    [],
-    [with_spath=no]
-  )
-
-  AS_IF([test x$with_spath != xno], [
-    CPPFLAGS="-I${withval}/include ${CPPFLAGS}"
-    LDFLAGS="-L${withval}/lib -L${withval}/lib64 ${LDFLAGS}"
+    SPATH_DIR="${withval}"
+    SPATH_CFLAGS="-I${SPATH_DIR}/include"
+    SPATH_LDFLAGS="-L${SPATH_DIR}/lib64 -L${SPATH_DIR}/lib -Wl,-rpath,${SPATH_DIR}/lib64 -Wl,-rpath,${SPATH_DIR}/lib"
+ CFLAGS="$CFLAGS ${SPATH_CFLAGS}" + CXXFLAGS="$CXXFLAGS ${SPATH_CFLAGS}" + LDFLAGS="$LDFLAGS ${SPATH_LDFLAGS}" + ], + [ + SPATH_CFLAGS="" + SPATH_LDFLAGS="" ] ) AC_CHECK_LIB([spath], [spath_strdup_reduce_str], [ - LIBS="$LIBS -lspath" - AC_DEFINE([HAVE_SPATH], [1], [Defined if you have spath]) - ],[ - AC_MSG_WARN([couldn't find a suitable libspath]) + SPATH_LIBS="${SPATH_LDFLAGS} -lspath" + AC_SUBST(SPATH_CFLAGS) + AC_SUBST(SPATH_LDFLAGS) + AC_SUBST(SPATH_LIBS) + AM_CONDITIONAL([HAVE_SPATH], [true]) + AC_DEFINE([USE_SPATH], [1], [Defined if you have libspath]) + ], + [ + AC_MSG_WARN([couldn't find a suitable libspath, use --with-spath=PATH]) + AM_CONDITIONAL([HAVE_SPATH], [false]) ], [] ) + + # restore flags + CFLAGS=$SPATH_OLD_CFLAGS + CXXFLAGS=$SPATH_OLD_CXXFLAGS + LDFLAGS=$SPATH_OLD_LDFLAGS ]) diff --git a/t/Makefile.am b/t/Makefile.am index 674b4755e..fdf35aa0d 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -4,15 +4,14 @@ TEST_EXTENSIONS = .t T_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) $(top_srcdir)/t/tap-driver.sh # Order matters -TESTS = \ - 0001-setup.t +TESTS = 0001-setup.t if HAVE_GOTCHA - TESTS += \ - 0100-sysio-gotcha.t \ - 0110-statfs-gotcha.t \ - 0200-stdio-gotcha.t -endif +TESTS += \ + 0100-sysio-gotcha.t \ + 0110-statfs-gotcha.t \ + 0200-stdio-gotcha.t +endif #HAVE_GOTCHA TESTS += \ 0500-sysio-static.t \ @@ -28,27 +27,7 @@ TESTS += \ 9300-unifyfs-stage-isolated.t \ 9999-cleanup.t -check_SCRIPTS = \ - 0001-setup.t \ - 0500-sysio-static.t \ - 0510-statfs-static.t \ - 0600-stdio-static.t \ - 0700-unifyfs-stage-full.t \ - 8000-client-api.t \ - 9005-unifyfs-unmount.t \ - 9010-stop-unifyfsd.t \ - 9020-mountpoint-empty.t \ - 9200-seg-tree-test.t \ - 9201-slotmap-test.t \ - 9300-unifyfs-stage-isolated.t \ - 9999-cleanup.t - -if HAVE_GOTCHA - check_SCRIPTS += \ - 0100-sysio-gotcha.t \ - 0110-statfs-gotcha.t \ - 0200-stdio-gotcha.t -endif +check_SCRIPTS = $(TESTS) EXTRA_DIST = \ $(check_SCRIPTS) \ @@ -71,67 +50,67 @@ libexec_PROGRAMS = \ unifyfs_unmount.t if HAVE_GOTCHA - libexec_PROGRAMS += \ - std/stdio-gotcha.t \ - sys/statfs-gotcha.t \ - sys/sysio-gotcha.t +libexec_PROGRAMS += \ + std/stdio-gotcha.t \ + sys/statfs-gotcha.t \ + sys/sysio-gotcha.t endif +# Compile/link flag definitions + +test_cppflags = \ + -I$(top_srcdir) \ + -I$(top_srcdir)/client/src \ + -I$(top_srcdir)/common/src \ + -D_GNU_SOURCE \ + $(AM_CPPFLAGS) \ + $(MPI_CFLAGS) + +test_ldadd = \ + $(top_builddir)/t/lib/libtap.la \ + $(top_builddir)/t/lib/libtestutil.la \ + $(MPI_CLDFLAGS) \ + -lpthread + +# flags for api tests test_api_ldadd = \ $(top_builddir)/t/lib/libtap.la \ $(top_builddir)/t/lib/libtestutil.la \ $(top_builddir)/client/src/libunifyfs_api.la -test_api_ldflags = \ +test_api_ldflags = \ $(AM_LDFLAGS) \ - $(MPI_CLDFLAGS) \ $(CP_WRAPPERS) \ -static -test_common_ldadd = \ - $(top_builddir)/t/lib/libtap.la \ - $(top_builddir)/t/lib/libtestutil.la +# flags for common tests +test_common_ldadd = $(test_ldadd) test_common_ldflags = \ $(AM_LDFLAGS) \ - -static -lpthread + -static +# flags for gotcha wrap tests test_gotcha_ldadd = \ - $(top_builddir)/t/lib/libtap.la \ - $(top_builddir)/t/lib/libtestutil.la \ + $(test_ldadd) \ $(top_builddir)/client/src/libunifyfs_gotcha.la -test_gotcha_ldflags = \ - $(AM_LDFLAGS)\ - $(MPI_CLDFLAGS) - +# flags for linker wrap tests test_wrap_ldadd = \ - $(top_builddir)/t/lib/libtap.la \ - $(top_builddir)/t/lib/libtestutil.la \ + $(test_ldadd) \ $(top_builddir)/client/src/libunifyfs.la test_wrap_ldflags = \ $(AM_LDFLAGS) \ - $(MPI_CLDFLAGS) \ $(CP_WRAPPERS) \ -static 
-test_common_cppflags = \ - -I$(top_srcdir) \ - -I$(top_srcdir)/common/src \ - -D_GNU_SOURCE \ - $(AM_CPPFLAGS) - -test_cppflags = \ - -I$(top_srcdir) \ - -I$(top_srcdir)/client/src \ - -I$(top_srcdir)/common/src \ - -D_GNU_SOURCE \ - $(AM_CPPFLAGS) \ - $(MPI_CFLAGS) - +# Per-target flags begin here -api_client_api_test_t_SOURCES = \ +api_client_api_test_t_CPPFLAGS = $(test_cppflags) +api_client_api_test_t_LDADD = $(test_api_ldadd) +api_client_api_test_t_LDFLAGS = $(test_api_ldflags) +api_client_api_test_t_SOURCES = \ api/client_api_suite.h \ api/client_api_suite.c \ api/init-fini.c \ @@ -139,11 +118,7 @@ api_client_api_test_t_SOURCES = \ api/write-read-sync-stat.c \ api/laminate.c -api_client_api_test_t_CPPFLAGS = $(test_cppflags) -api_client_api_test_t_LDADD = $(test_api_ldadd) -api_client_api_test_t_LDFLAGS = $(test_api_ldflags) - -sys_sysio_gotcha_t_SOURCES = \ +test_sysio_sources = \ sys/sysio_suite.h \ sys/sysio_suite.c \ sys/statfs.c \ @@ -160,48 +135,30 @@ sys_sysio_gotcha_t_SOURCES = \ sys/chdir.c sys_sysio_gotcha_t_CPPFLAGS = $(test_cppflags) -sys_sysio_gotcha_t_LDADD = $(test_gotcha_ldadd) -sys_sysio_gotcha_t_LDFLAGS = $(test_gotcha_ldflags) - -sys_sysio_static_t_SOURCES = \ - sys/sysio_suite.h \ - sys/sysio_suite.c \ - sys/statfs.c \ - sys/creat-close.c \ - sys/creat64.c \ - sys/mkdir-rmdir.c \ - sys/open.c \ - sys/open64.c \ - sys/lseek.c \ - sys/write-read.c \ - sys/write-read-hole.c \ - sys/truncate.c \ - sys/unlink.c \ - sys/chdir.c +sys_sysio_gotcha_t_LDADD = $(test_gotcha_ldadd) +sys_sysio_gotcha_t_SOURCES = $(test_sysio_sources) sys_sysio_static_t_CPPFLAGS = $(test_cppflags) -sys_sysio_static_t_LDADD = $(test_wrap_ldadd) -sys_sysio_static_t_LDFLAGS = $(test_wrap_ldflags) +sys_sysio_static_t_LDADD = $(test_wrap_ldadd) +sys_sysio_static_t_LDFLAGS = $(test_wrap_ldflags) +sys_sysio_static_t_SOURCES = $(test_sysio_sources) -sys_statfs_gotcha_t_SOURCES = \ +test_statfs_sources = \ sys/statfs_suite.h \ sys/statfs_suite.c \ sys/statfs.c sys_statfs_gotcha_t_CPPFLAGS = $(test_cppflags) -sys_statfs_gotcha_t_LDADD = $(test_gotcha_ldadd) -sys_statfs_gotcha_t_LDFLAGS = $(test_gotcha_ldflags) - -sys_statfs_static_t_SOURCES = \ - sys/statfs_suite.h \ - sys/statfs_suite.c \ - sys/statfs.c +sys_statfs_gotcha_t_LDADD = $(test_gotcha_ldadd) +sys_statfs_gotcha_t_SOURCES = $(test_statfs_sources) sys_statfs_static_t_CPPFLAGS = $(test_cppflags) -sys_statfs_static_t_LDADD = $(test_wrap_ldadd) -sys_statfs_static_t_LDFLAGS = $(test_wrap_ldflags) +sys_statfs_static_t_LDADD = $(test_wrap_ldadd) +sys_statfs_static_t_LDFLAGS = $(test_wrap_ldflags) +sys_statfs_static_t_SOURCES = $(test_statfs_sources) -std_stdio_gotcha_t_SOURCES = \ + +test_stdio_sources = \ std/stdio_suite.h \ std/stdio_suite.c \ std/fopen-fclose.c \ @@ -211,39 +168,31 @@ std_stdio_gotcha_t_SOURCES = \ std/size.c std_stdio_gotcha_t_CPPFLAGS = $(test_cppflags) -std_stdio_gotcha_t_LDADD = $(test_gotcha_ldadd) -std_stdio_gotcha_t_LDFLAGS = $(test_gotcha_ldflags) - -std_stdio_static_t_SOURCES = \ - std/stdio_suite.h \ - std/stdio_suite.c \ - std/fopen-fclose.c \ - std/fseek-ftell.c \ - std/fwrite-fread.c \ - std/fflush.c \ - std/size.c +std_stdio_gotcha_t_LDADD = $(test_gotcha_ldadd) +std_stdio_gotcha_t_SOURCES = $(test_stdio_sources) std_stdio_static_t_CPPFLAGS = $(test_cppflags) -std_stdio_static_t_LDADD = $(test_wrap_ldadd) -std_stdio_static_t_LDFLAGS = $(test_wrap_ldflags) +std_stdio_static_t_LDADD = $(test_wrap_ldadd) +std_stdio_static_t_LDFLAGS = $(test_wrap_ldflags) +std_stdio_static_t_SOURCES = $(test_stdio_sources) 
-unifyfs_unmount_t_SOURCES = unifyfs_unmount.c unifyfs_unmount_t_CPPFLAGS = $(test_cppflags) -unifyfs_unmount_t_LDADD = $(test_wrap_ldadd) -unifyfs_unmount_t_LDFLAGS = $(test_wrap_ldflags) - -common_seg_tree_test_t_SOURCES = \ +unifyfs_unmount_t_LDADD = $(test_wrap_ldadd) +unifyfs_unmount_t_LDFLAGS = $(test_wrap_ldflags) +unifyfs_unmount_t_SOURCES = unifyfs_unmount.c + +common_seg_tree_test_t_CPPFLAGS = $(test_cppflags) +common_seg_tree_test_t_LDADD = $(test_common_ldadd) +common_seg_tree_test_t_LDFLAGS = $(test_common_ldflags) +common_seg_tree_test_t_SOURCES = \ common/seg_tree_test.c \ ../common/src/seg_tree.c \ ../common/src/unifyfs_log.c \ ../common/src/unifyfs_misc.c -common_seg_tree_test_t_CPPFLAGS = $(test_common_cppflags) -common_seg_tree_test_t_LDADD = $(test_common_ldadd) -common_seg_tree_test_t_LDFLAGS = $(test_common_ldflags) -common_slotmap_test_t_SOURCES = \ +common_slotmap_test_t_CPPFLAGS = $(test_cppflags) +common_slotmap_test_t_LDADD = $(test_common_ldadd) +common_slotmap_test_t_LDFLAGS = $(test_common_ldflags) +common_slotmap_test_t_SOURCES = \ common/slotmap_test.c \ ../common/src/slotmap.c -common_slotmap_test_t_CPPFLAGS = $(test_common_cppflags) -common_slotmap_test_t_LDADD = $(test_common_ldadd) -common_slotmap_test_t_LDFLAGS = $(test_common_ldflags) diff --git a/t/sys/chdir.c b/t/sys/chdir.c index dc1f8e581..6b931e554 100644 --- a/t/sys/chdir.c +++ b/t/sys/chdir.c @@ -25,10 +25,12 @@ #include #include #include -#include "config.h" + #include "t/lib/tap.h" #include "t/lib/testutil.h" +#include "config.h" // for USE_SPATH + int chdir_test(char* unifyfs_root) { diag("Starting UNIFYFS_WRAP(chdir/fchdir/getcwd/getwd/" @@ -164,7 +166,7 @@ int chdir_test(char* unifyfs_root) "%s:%d getcwd returned %s expected %s", __FILE__, __LINE__, str, buf); -#ifdef HAVE_SPATH +#ifdef USE_SPATH /* change back to root unifyfs directory */ errno = 0; rc = chdir(".."); @@ -218,7 +220,7 @@ int chdir_test(char* unifyfs_root) #else skip(1, 9, "test requires missing spath dependency"); end_skip; -#endif /* HAVE_SPATH */ +#endif /* USE_SPATH */ /* TODO: Some compilers throw a warning/error if one uses getwd(). 
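
The spath guard used in this test follows one pattern: checks that need relative-path normalization compile only when configure defined USE_SPATH in config.h, and the same number of TAP checks is skipped otherwise so the test plan stays balanced. A minimal sketch of that pattern follows, assuming the project's TAP helpers from t/lib/tap.h (ok, skip, end_skip) as used elsewhere in chdir.c; the helper name is hypothetical.

    #include <errno.h>
    #include <unistd.h>

    #include "t/lib/tap.h"  /* project TAP helpers: ok(), skip(), end_skip */
    #include "config.h"     /* defines USE_SPATH when libspath was found */

    /* hypothetical helper: one relative-path check, or one skipped check */
    static void check_parent_chdir(void)
    {
    #ifdef USE_SPATH
        /* libspath present: relative paths can be normalized, so test them */
        errno = 0;
        int rc = chdir("..");
        ok(rc == 0 && errno == 0,
           "%s:%d chdir(..) should succeed (errno=%d)",
           __FILE__, __LINE__, errno);
    #else
        /* libspath missing: skip the check but keep the TAP plan intact */
        skip(1, 1, "test requires missing spath dependency");
        end_skip;
    #endif
    }
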
* For those compilers that allow it, it would be nice to execute @@ -353,7 +355,7 @@ int chdir_test(char* unifyfs_root) free(str); } -#ifdef HAVE_SPATH +#ifdef USE_SPATH /* change back to root unifyfs directory */ errno = 0; rc = chdir(".."); @@ -396,7 +398,7 @@ int chdir_test(char* unifyfs_root) #else skip(1, 6, "test requires missing spath dependency"); end_skip; -#endif /* HAVE_SPATH */ +#endif /* USE_SPATH */ /* TODO: Our directory wrappers are not fully functioning yet, diff --git a/util/unifyfs-stage/src/Makefile.am b/util/unifyfs-stage/src/Makefile.am index a62660884..494909682 100644 --- a/util/unifyfs-stage/src/Makefile.am +++ b/util/unifyfs-stage/src/Makefile.am @@ -1,10 +1,13 @@ libexec_PROGRAMS = unifyfs-stage -unifyfs_stage_SOURCES = unifyfs-stage.c \ - unifyfs-stage-transfer.c - noinst_HEADERS = unifyfs-stage.h +CLEANFILES = $(libexec_PROGRAMS) + +# Compiler/linker flags + +AM_CFLAGS = -Wall -Werror + stage_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ $(OPENSSL_CFLAGS) \ -I$(top_srcdir)/client/src \ @@ -12,15 +15,31 @@ stage_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ if USE_PMPI_WRAPPERS stage_cppflags += -DENABLE_MPI_MOUNT +stage_unify_lib = $(top_builddir)/client/src/libunifyfs_mpi.la +else +stage_unify_lib = $(top_builddir)/client/src/libunifyfs.la endif -unifyfs_stage_CPPFLAGS = $(stage_cppflags) +stage_ldadd = \ + $(stage_unify_lib) \ + -lrt -lm \ + $(OPENSSL_LIBS) \ + $(MPI_CLDFLAGS) -unifyfs_stage_LDADD = $(top_builddir)/client/src/libunifyfs.la -lrt -lm +stage_ldflags = \ + $(AM_LDFLAGS) \ + $(CP_WRAPPERS) \ + -static -unifyfs_stage_LDFLAGS = -static $(CP_WRAPPERS) $(AM_LDFLAGS) \ - $(MPI_CLDFLAGS) $(OPENSSL_LIBS) +# Per-target flags begin here + +stage_sources = \ + unifyfs-stage.c \ + unifyfs-stage-transfer.c + +unifyfs_stage_CPPFLAGS = $(stage_cppflags) +unifyfs_stage_LDADD = $(stage_ldadd) +unifyfs_stage_LDFLAGS = $(stage_ldflags) +unifyfs_stage_SOURCES = $(stage_sources) -AM_CFLAGS = -Wall -Werror -CLEANFILES = $(libexec_PROGRAMS) From c8b36b021930b9083c8599581e48e3d62a7b2083 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Fri, 11 Jun 2021 10:54:34 -0400 Subject: [PATCH 11/81] Cray compiler wrapper support for lx_find_mpi.m4 TEST_CHECKPATCH_SKIP_FILES=docs/build.rst --- docs/build.rst | 6 ++++++ m4/lx_find_mpi.m4 | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/docs/build.rst b/docs/build.rst index b52ac21bf..709eab3c9 100644 --- a/docs/build.rst +++ b/docs/build.rst @@ -210,6 +210,12 @@ Alternatively, UnifyFS can be configured using ``--with`` options: To see all available build configuration options, run ``./configure --help`` after ``./autogen.sh`` has been run. + +.. note:: + + On Cray systems, the detection of MPI compiler wrappers requires passing the + following flags to the configure command: ``MPICC=cc MPIFC=ftn`` + --------------------------- ----------------- diff --git a/m4/lx_find_mpi.m4 b/m4/lx_find_mpi.m4 index 1e1c84b20..2631d2657 100644 --- a/m4/lx_find_mpi.m4 +++ b/m4/lx_find_mpi.m4 @@ -118,6 +118,13 @@ AC_DEFUN([LX_QUERY_MPI_COMPILER], echo yes else echo no + echo -n "Checking whether $$1 responds to '--cray-print-opts=all'... " + lx_mpi_command_line=`$$1 --cray-print-opts=all 2>/dev/null` + if [[ "$?" -eq 0 ]]; then + echo yes + else + echo no + fi fi fi else From 164b5f07666057eea065804dd7eb8446212200bd Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Tue, 23 Feb 2021 17:16:44 -0500 Subject: [PATCH 12/81] offload server-server request processing to svcmgr Also: * avoid laminate brodcast for private files * allocate extent_tree when inode is allocated * disable dispatcher signaling that should not be necessary * use max bulk size in client-server transfers * only broadcast truncate when new size is smaller * avoid metaget storm on file open by using server cached attributes * build fix for --enable-pmix * create mountpoint dir during new app init * support margo_lazy_connect to speed bootstrap * remove racy client read pthread condition signal/wait * add svcmgr progress alarm to detect hangs * fix unifyfs start/terminate on SLURM TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- client/src/client_read.c | 75 +- client/src/client_read.h | 11 +- client/src/margo_client.c | 46 +- client/src/unifyfs-dirops.c | 2 +- client/src/unifyfs-internal.h | 7 +- client/src/unifyfs.c | 86 +- common/src/unifyfs_configurator.h | 1 + common/src/unifyfs_keyval.c | 2 + common/src/unifyfs_logio.c | 2 +- common/src/unifyfs_meta.h | 46 +- common/src/unifyfs_rpc_types.h | 1 + common/src/unifyfs_server_rpcs.h | 26 + examples/src/testutil.h | 3 + examples/src/writeread.c | 38 +- server/src/Makefile.am | 2 +- server/src/margo_server.c | 178 +- server/src/margo_server.h | 11 + ...yfs_cmd_handler.c => unifyfs_client_rpc.c} | 137 +- server/src/unifyfs_fops_rpc.c | 45 +- server/src/unifyfs_global.h | 25 +- server/src/unifyfs_group_rpc.c | 1756 ++++++++--------- server/src/unifyfs_group_rpc.h | 35 +- server/src/unifyfs_inode.c | 99 +- server/src/unifyfs_inode.h | 60 +- server/src/unifyfs_inode_tree.c | 24 +- server/src/unifyfs_p2p_rpc.c | 1229 ++++++++---- server/src/unifyfs_p2p_rpc.h | 53 +- server/src/unifyfs_request_manager.c | 365 +--- server/src/unifyfs_request_manager.h | 14 +- server/src/unifyfs_server.c | 43 +- server/src/unifyfs_server_pid.c | 135 +- server/src/unifyfs_service_manager.c | 1147 ++++++++--- server/src/unifyfs_service_manager.h | 44 +- util/unifyfs/src/unifyfs-rm.c | 35 +- 34 files changed, 3408 insertions(+), 2375 deletions(-) rename server/src/{unifyfs_cmd_handler.c => unifyfs_client_rpc.c} (87%) diff --git a/client/src/client_read.c b/client/src/client_read.c index 3166e8ed2..476ba9fd0 100644 --- a/client/src/client_read.c +++ b/client/src/client_read.c @@ -84,19 +84,6 @@ client_mread_status* client_create_mread_request(int n_reads, mread->n_reads = (unsigned int) n_reads; ABT_mutex_create(&(mread->sync)); - rc = pthread_mutex_init(&(mread->mutex), NULL); - if (rc != 0) { - LOGERR("client mread status pthread mutex init failed"); - free(mread); - return NULL; - } - rc = pthread_cond_init(&(mread->completed), NULL); - if (rc != 0) { - LOGERR("client mread status pthread condition init failed"); - free(mread); - return NULL; - } - return mread; } @@ -116,8 +103,6 @@ int client_remove_mread_request(client_mread_status* mread) void* list_item = arraylist_remove(active_mreads, list_index); if (list_item == (void*)mread) { ABT_mutex_free(&(mread->sync)); - pthread_cond_destroy(&(mread->completed)); - pthread_mutex_destroy(&(mread->mutex)); free(mread); return UNIFYFS_SUCCESS; } else { @@ -166,8 +151,6 @@ int client_update_mread_request(client_mread_status* mread, ABT_mutex_lock(mread->sync); if (req_index < mread->n_reads) { - LOGDBG("updating mread[%u] status for request %u", - mread->id, req_index); read_req_t* rdreq = mread->reqs + req_index; if (req_complete) { mread->n_complete++; @@ -178,6 +161,10 @@ int 
client_update_mread_request(client_mread_status* mread, } else { rdreq->nread = rdreq->cover_end_offset + 1; } + LOGINFO("updating mread[%u] status for request %u of %u " + "(n_complete=%u, n_error=%u)", + mread->id, req_index, mread->n_reads, + mread->n_complete, mread->n_error); } } else { LOGERR("invalid read request index %u (mread[%u] has %u reqs)", @@ -189,10 +176,8 @@ int client_update_mread_request(client_mread_status* mread, ABT_mutex_unlock(mread->sync); if (complete) { - /* Signal client thread waiting on mread completion */ - LOGDBG("mread[%u] signaling completion of %u requests", + LOGDBG("mread[%u] completed %u requests", mread->id, mread->n_reads); - pthread_cond_signal(&(mread->completed)); } return ret; @@ -603,33 +588,28 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) /* wait for all requests to finish by blocking on mread * completion condition (with a reasonable timeout) */ LOGDBG("waiting for completion of mread[%u]", mread->id); - pthread_mutex_lock(&(mread->mutex)); - - /* this loop is a workaround for having a single timed condition wait - * that occasionally fails to receive the condition signal */ - int wait_rc = 0; - int wait_interval = UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS / 10; - int wait_time = 0; - while ((mread->n_complete < mread->n_reads) && - (wait_time < UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS)) { - struct timespec timeout; - clock_gettime(CLOCK_REALTIME, &timeout); - timeout.tv_sec += wait_interval; - wait_rc = pthread_cond_timedwait(&(mread->completed), - &(mread->mutex), &timeout); - if (wait_rc) { - if (ETIMEDOUT == wait_rc) { - wait_time += wait_interval; - } else { - LOGERR("mread[%u] condition wait failed (err=%d)", - mread->id, wait_rc); - ret = wait_rc; - break; - } + + /* this loop uses usleep() instead of pthread_cond_timedwait() + * because that method caused unexplained read timeouts */ + int wait_time_ms = 0; + int complete = 0; + while (1) { + ABT_mutex_lock(mread->sync); + complete = (mread->n_complete == mread->n_reads); + ABT_mutex_unlock(mread->sync); + if (complete) { + break; } + + if ((wait_time_ms / 1000) >= UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS) { + LOGERR("mread[%u] timed out", mread->id); + break; + } + + usleep(50000); /* sleep 50 ms */ + wait_time_ms += 50; } - if (wait_time >= UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS) { - LOGERR("mread[%u] timed out", mread->id); + if (!complete) { for (i = 0; i < server_count; i++) { if (EINPROGRESS == server_reqs[i].errcode) { server_reqs[i].errcode = ETIMEDOUT; @@ -637,9 +617,8 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) } } } - LOGDBG("mread[%u] wait completed (rc=%d) - %u requests, %u errors", - mread->id, wait_rc, mread->n_reads, mread->n_error); - pthread_mutex_unlock(&(mread->mutex)); + LOGDBG("mread[%u] wait completed - %u requests, %u errors", + mread->id, mread->n_reads, mread->n_error); } /* got all of the data we'll get from the server, check for short reads diff --git a/client/src/client_read.h b/client/src/client_read.h index 02c366d6e..f7202f802 100644 --- a/client/src/client_read.h +++ b/client/src/client_read.h @@ -28,17 +28,12 @@ typedef struct { unsigned int id; /* unique id for this set of read requests */ unsigned int n_reads; /* number of read requests */ - unsigned int n_complete; /* number of completed requests */ - unsigned int n_error; /* number of requests that encountered errors */ read_req_t* reqs; /* array of read requests */ - /* the following is for synchronizing access/updates to above state */ + /* the following is for synchronizing 
access/updates to below state */ ABT_mutex sync; - - /* pthread mutex and condition used to signal the client thread that - * issued the mread that the full set of requests has been processed */ - pthread_mutex_t mutex; - pthread_cond_t completed; + volatile unsigned int n_complete; /* number of completed requests */ + volatile unsigned int n_error; /* number of requests that had errors */ } client_mread_status; /* an arraylist to maintain the active mread requests for the client */ diff --git a/client/src/margo_client.c b/client/src/margo_client.c index a6caa9d30..e95c284a4 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -793,29 +793,51 @@ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) assert(mid != MARGO_INSTANCE_NULL); /* register user buffer for bulk access */ - hg_bulk_t bulk_handle; + hg_bulk_t bulk_local; hret = margo_bulk_create(mid, 1, &user_buf, &data_size, - HG_BULK_WRITE_ONLY, &bulk_handle); + HG_BULK_WRITE_ONLY, &bulk_local); if (hret != HG_SUCCESS) { LOGERR("margo_bulk_create() failed"); ret = UNIFYFS_ERROR_MARGO; } else { - /* do bulk transfer */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_data, 0, - bulk_handle, 0, data_size); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { + /* execute the transfer to pull data from remote side + * into our local buffer. + * + * NOTE: mercury/margo bulk transfer does not check the + * maximum transfer size that the underlying transport + * supports, and a large bulk transfer may result in + * failure. */ + int i = 0; + hg_size_t remain = in.bulk_size; + do { + hg_size_t offset = i * MAX_BULK_TX_SIZE; + hg_size_t len = remain < MAX_BULK_TX_SIZE ? + remain : MAX_BULK_TX_SIZE; + hret = margo_bulk_transfer(mid, HG_BULK_PULL, + hgi->addr, + in.bulk_data, offset, + bulk_local, offset, + len); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer(buf_offset=%zu, " + "len=%zu) failed", + (size_t)offset, (size_t)len); + ret = UNIFYFS_ERROR_MARGO; + break; + } + remain -= len; + i++; + } while (remain > 0); + + if (hret == HG_SUCCESS) { ABT_mutex_lock(mread->sync); update_read_req_coverage(rdreq, data_offset, data_size); ABT_mutex_unlock(mread->sync); - LOGDBG("updated coverage for mread[%d] request %d", + LOGINFO("updated coverage for mread[%d] request %d", client_mread, read_index); } - margo_bulk_free(bulk_handle); + margo_bulk_free(bulk_local); } } } diff --git a/client/src/unifyfs-dirops.c b/client/src/unifyfs-dirops.c index 6c2f29a34..80b7e9f7d 100644 --- a/client/src/unifyfs-dirops.c +++ b/client/src/unifyfs-dirops.c @@ -133,7 +133,7 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) return NULL; } } else { - fid = unifyfs_fid_create_file(upath); + fid = unifyfs_fid_create_file(upath, 0); if (fid < 0) { errno = unifyfs_rc_errno(-fid); return NULL; diff --git a/client/src/unifyfs-internal.h b/client/src/unifyfs-internal.h index 0344e6eb9..000381c68 100644 --- a/client/src/unifyfs-internal.h +++ b/client/src/unifyfs-internal.h @@ -316,8 +316,8 @@ typedef struct { * If cover_begin_offset != 0, there is a gap at the beginning * of the read extent that should be zero-filled. * If cover_end_offset != (length - 1), it was a short read. */ - size_t cover_begin_offset; - size_t cover_end_offset; + volatile size_t cover_begin_offset; + volatile size_t cover_end_offset; /* nread is the user-visible number of bytes read. 
Since this includes * any gaps, nread should be set to (cover_end_offset + 1) when the @@ -522,7 +522,8 @@ int unifyfs_fid_free(int fid); /* add a new file and initialize metadata * returns the new fid, or negative value on error */ -int unifyfs_fid_create_file(const char* path); +int unifyfs_fid_create_file(const char* path, + int exclusive); /* add a new directory and initialize metadata * returns the new fid, or a negative value on error */ diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 8b2a93bb9..1c64b695b 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -288,7 +288,7 @@ static void unifyfs_normalize_path(const char* path, char* normalized) #endif /* USE_SPATH */ } -/* Given a path, which may relative or absoluate, +/* Given a path, which may relative or absolute, * return 1 if we should intercept the path, 0 otherwise. * If path is to be intercepted, returned a normalized version in upath. */ inline int unifyfs_intercept_path(const char* path, char* upath) @@ -394,7 +394,7 @@ inline int unifyfs_intercept_dirstream(DIR* dirp) return 0; } -/* given a path, return the file id */ +/* given a path, return the local file id, or -1 if not found */ inline int unifyfs_get_fid_from_path(const char* path) { /* scan through active entries in filelist array looking @@ -874,7 +874,8 @@ int unifyfs_fid_free(int fid) /* add a new file and initialize metadata * returns the new fid, or negative value on error */ -int unifyfs_fid_create_file(const char* path) +int unifyfs_fid_create_file(const char* path, + int exclusive) { /* check that pathname is within bounds */ size_t pathlen = strlen(path) + 1; @@ -906,6 +907,7 @@ int unifyfs_fid_create_file(const char* path) meta->attrs.size = 0; meta->attrs.mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; meta->attrs.is_laminated = 0; + meta->attrs.is_shared = !exclusive; meta->attrs.filename = (char*)&(unifyfs_filelist[fid].filename); /* use client user/group */ @@ -931,12 +933,13 @@ int unifyfs_fid_create_file(const char* path) return fid; } +/* create directory state for given path. returns success|error */ int unifyfs_fid_create_directory(const char* path) { /* check that pathname is within bounds */ size_t pathlen = strlen(path) + 1; if (pathlen > UNIFYFS_MAX_FILENAME) { - return (int) ENAMETOOLONG; + return ENAMETOOLONG; } /* get local and global file ids */ @@ -944,16 +947,17 @@ int unifyfs_fid_create_directory(const char* path) int gfid = unifyfs_generate_gfid(path); /* test whether we have info for file in our local file list */ - int found_local = (fid >= 0); + int found_local = (fid != -1); /* test whether we have metadata for file in global key/value store */ - unifyfs_file_attr_t gfattr = { 0, }; - if (unifyfs_get_global_file_meta(gfid, &gfattr) == UNIFYFS_SUCCESS) { - /* can't create if it already exists */ - return EEXIST; + int found_global = 0; + unifyfs_file_attr_t gfattr = { 0 }; + int rc = unifyfs_get_global_file_meta(gfid, &gfattr); + if (UNIFYFS_SUCCESS == rc) { + found_global = 1; } - if (found_local) { + if (found_local && !found_global) { /* exists locally, but not globally * * FIXME: so, we have detected the cache inconsistency here. @@ -967,7 +971,7 @@ int unifyfs_fid_create_directory(const char* path) * deletes the global entry without checking any local used entries * in other processes. * - * we currently return EEXIS, and this needs to be addressed according + * we currently return EEXIST, and this needs to be addressed according * to a consistency model this fs intance assumes. 
*/ return EEXIST; @@ -975,21 +979,31 @@ int unifyfs_fid_create_directory(const char* path) /* now, we need to create a new directory. we reuse the file creation * method and then update the mode to indicate it's a directory */ - fid = unifyfs_fid_create_file(path); - if (fid < 0) { - return -fid; - } - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; + if (!found_local) { + /* create a new file */ + fid = unifyfs_fid_create_file(path, 0); + if (fid < 0) { + /* convert negative error code to positive */ + return -fid; + } - /* insert global meta data for directory */ - unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; - int ret = unifyfs_set_global_file_meta_from_fid(fid, op); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("Failed to populate the global meta entry for %s (fid:%d)", - path, fid); - return ret; + /* mark it as a directory */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + assert(meta != NULL); + meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; + + if (!found_global) { + /* insert global meta data for directory */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; + rc = unifyfs_set_global_file_meta_from_fid(fid, op); + if (rc != UNIFYFS_SUCCESS) { + if (rc != EEXIST) { + LOGERR("Failed to add global metadata for dir %s (rc=%d)", + path, rc); + return rc; + } /* else, someone else created global metadata first */ + } + } } return UNIFYFS_SUCCESS; @@ -1186,6 +1200,8 @@ int unifyfs_fid_open( /* determine whether any write flags are specified */ int open_for_write = flags & (O_RDWR | O_WRONLY); + int exclusive = flags & O_EXCL; + /* struct to hold global metadata for file */ unifyfs_file_attr_t gfattr = { 0, }; @@ -1214,7 +1230,7 @@ int unifyfs_fid_open( * allocate a local file id structure if needed */ if (!found_local) { /* initialize local metadata for this file */ - fid = unifyfs_fid_create_file(path); + fid = unifyfs_fid_create_file(path, exclusive); if (fid < 0) { LOGERR("failed to create a new file %s", path); return -fid; @@ -1235,7 +1251,7 @@ int unifyfs_fid_open( /* insert file attribute for file in key-value store */ unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; ret = unifyfs_set_global_file_meta_from_fid(fid, op); - if (ret == EEXIST && !(flags & O_EXCL)) { + if (ret == EEXIST && !exclusive) { /* File didn't exist before, but now it does. * Another process beat us to the punch in creating it. * Read its metadata to update our cache. 
*/ @@ -1295,7 +1311,7 @@ int unifyfs_fid_open( * allocate a local file id structure if needed */ if (!found_local) { /* initialize local metadata for this file */ - fid = unifyfs_fid_create_file(path); + fid = unifyfs_fid_create_file(path, 0); if (fid < 0) { LOGERR("failed to create a new file %s", path); return -fid; @@ -2038,15 +2054,15 @@ int unifyfs_mount( } /* add mount point as a new directory in the file list */ - if (unifyfs_get_fid_from_path(prefix) < 0) { + int fid = unifyfs_get_fid_from_path(prefix); + if (fid < 0) { /* no entry exists for mount point, so create one */ - int fid = unifyfs_fid_create_directory(prefix); - if (fid < 0) { - /* if there was an error, return it */ - LOGERR("failed to create directory entry for mount point: `%s'", - prefix); + rc = unifyfs_fid_create_directory(prefix); + if ((rc != UNIFYFS_SUCCESS) && (rc != EEXIST)) { + /* if there was an error other than EEXIST, return it */ + LOGERR("failed to create directory for mount point: %s", prefix); unifyfs_fini(); - return UNIFYFS_FAILURE; + return rc; } } diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 5ffa9474d..16c797705 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -83,6 +83,7 @@ UNIFYFS_CFG(logio, shmem_size, INT, UNIFYFS_LOGIO_SHMEM_SIZE, "log-based I/O shared memory region size", NULL) \ UNIFYFS_CFG(logio, spill_size, INT, UNIFYFS_LOGIO_SPILL_SIZE, "log-based I/O spillover file size", NULL) \ UNIFYFS_CFG(logio, spill_dir, STRING, NULLSTRING, "spillover directory", configurator_directory_check) \ + UNIFYFS_CFG(margo, lazy_connect, BOOL, off, "wait until first communication with server to resolve its connection address", NULL) \ UNIFYFS_CFG(margo, tcp, BOOL, on, "use TCP for server-to-server margo RPCs", NULL) \ UNIFYFS_CFG(meta, db_name, STRING, META_DEFAULT_DB_NAME, "metadata database name", NULL) \ UNIFYFS_CFG(meta, db_path, STRING, RUNDIR, "metadata database path", configurator_directory_check) \ diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 6a38a1e53..ddd3de4da 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -785,6 +785,7 @@ static int unifyfs_fskv_publish_remote(const char* key, return (int)UNIFYFS_SUCCESS; } +#if (!defined(USE_PMI2)) && (!defined(USE_PMIX)) static int unifyfs_fskv_fence(void) { if (!have_sharedfs_kvstore) { @@ -800,6 +801,7 @@ static int unifyfs_fskv_fence(void) return (int)UNIFYFS_SUCCESS; } +#endif //--------------------- K-V Store API --------------------- diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index afcd6f9b7..0b746fb5c 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -415,7 +415,7 @@ int unifyfs_logio_init_client(const int app_id, char* spill = (char*) spill_mapping; rc = init_log_header(spill, spill_size, chunk_size); if (rc != UNIFYFS_SUCCESS) { - LOGERR("Failed to initialize shmem logio header"); + LOGERR("Failed to initialize spill logio header"); return rc; } } diff --git a/common/src/unifyfs_meta.h b/common/src/unifyfs_meta.h index 3307f13c8..b21cc8ce9 100644 --- a/common/src/unifyfs_meta.h +++ b/common/src/unifyfs_meta.h @@ -32,6 +32,10 @@ extern "C" { #endif +#ifndef UNIFYFS_METADATA_CACHE_SECONDS +# define UNIFYFS_METADATA_CACHE_SECONDS 5 +#endif + /* extent slice size used for metadata */ extern size_t meta_slice_sz; @@ -68,6 +72,9 @@ typedef struct { /* Set when the file is laminated */ int is_laminated; + /* Set when file is shared between clients */ + int 
is_shared; + /* essential stat fields */ uint32_t mode; /* st_mode bits */ uint32_t uid; @@ -86,22 +93,17 @@ enum { }; static inline -int unifyfs_file_attr_set_invalid(unifyfs_file_attr_t* attr) +void unifyfs_file_attr_set_invalid(unifyfs_file_attr_t* attr) { - if (!attr) { - return EINVAL; - } - memset(attr, 0, sizeof(*attr)); - attr->filename = NULL; - attr->gfid = -1; + attr->filename = NULL; + attr->gfid = -1; attr->is_laminated = -1; - attr->mode = -1; - attr->uid = -1; - attr->gid = -1; - attr->size = (uint64_t) -1; - - return 0; + attr->is_shared = -1; + attr->mode = (uint32_t) -1; + attr->uid = (uint32_t) -1; + attr->gid = (uint32_t) -1; + attr->size = (uint64_t) -1; } static inline @@ -110,10 +112,12 @@ void debug_print_file_attr(unifyfs_file_attr_t* attr) if (!attr) { return; } - LOGDBG("fileattr(%p) - gfid=%d filename=%s laminated=%d", - attr, attr->gfid, attr->filename, attr->is_laminated); + LOGDBG("fileattr(%p) - gfid=%d filename=%s", + attr, attr->gfid, attr->filename); LOGDBG(" - sz=%zu mode=%o uid=%d gid=%d", (size_t)attr->size, attr->mode, attr->uid, attr->gid); + LOGDBG(" - shared=%d laminated=%d", + attr->is_shared, attr->is_laminated); LOGDBG(" - atime=%ld.%09ld ctime=%ld.%09ld mtime=%ld.%09ld", attr->atime.tv_sec, attr->atime.tv_nsec, attr->ctime.tv_sec, attr->ctime.tv_nsec, @@ -151,7 +155,7 @@ int unifyfs_file_attr_update(int attr_op, /* Update fields only with valid values and associated operation. * invalid values are set by unifyfs_file_attr_set_invalid() above */ - if ((src->mode != -1) && + if ((src->mode != (uint32_t)-1) && ((attr_op == UNIFYFS_FILE_ATTR_OP_CHMOD) || (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE) || (attr_op == UNIFYFS_FILE_ATTR_OP_LAMINATE))) { @@ -159,13 +163,13 @@ int unifyfs_file_attr_update(int attr_op, dst->mode = src->mode; } - if ((src->uid != -1) && + if ((src->uid != (uint32_t)-1) && ((attr_op == UNIFYFS_FILE_ATTR_OP_CHOWN) || (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE))) { dst->uid = src->uid; } - if ((src->gid != -1) && + if ((src->gid != (uint32_t)-1) && ((attr_op == UNIFYFS_FILE_ATTR_OP_CHGRP) || (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE))) { dst->gid = src->gid; @@ -216,6 +220,12 @@ int unifyfs_file_attr_update(int attr_op, dst->is_laminated = src->is_laminated; } + if ((src->is_shared != -1) && + (attr_op == UNIFYFS_FILE_ATTR_OP_CREATE)) { + LOGDBG("setting attr.is_shared to %d", src->is_shared); + dst->is_shared = src->is_shared; + } + if (src->filename && !dst->filename) { LOGDBG("setting attr.filename to %s", src->filename); dst->filename = strdup(src->filename); diff --git a/common/src/unifyfs_rpc_types.h b/common/src/unifyfs_rpc_types.h index 126ffdd3e..5a07d7587 100644 --- a/common/src/unifyfs_rpc_types.h +++ b/common/src/unifyfs_rpc_types.h @@ -32,6 +32,7 @@ MERCURY_GEN_STRUCT_PROC(sys_timespec_t, MERCURY_GEN_STRUCT_PROC(unifyfs_file_attr_t, ((int32_t)(gfid)) ((int32_t)(is_laminated)) + ((int32_t)(is_shared)) ((uint32_t)(mode)) ((uint32_t)(uid)) ((uint32_t)(gid)) diff --git a/common/src/unifyfs_server_rpcs.h b/common/src/unifyfs_server_rpcs.h index 349e1f11b..cf46a9373 100644 --- a/common/src/unifyfs_server_rpcs.h +++ b/common/src/unifyfs_server_rpcs.h @@ -31,6 +31,24 @@ extern "C" { #endif +typedef enum { + UNIFYFS_SERVER_RPC_INVALID = 0, + UNIFYFS_SERVER_RPC_CHUNK_READ, + UNIFYFS_SERVER_RPC_EXTENTS_ADD, + UNIFYFS_SERVER_RPC_EXTENTS_FIND, + UNIFYFS_SERVER_RPC_FILESIZE, + UNIFYFS_SERVER_RPC_LAMINATE, + UNIFYFS_SERVER_RPC_METAGET, + UNIFYFS_SERVER_RPC_METASET, + UNIFYFS_SERVER_RPC_PID_REPORT, + UNIFYFS_SERVER_RPC_TRUNCATE, + 
UNIFYFS_SERVER_BCAST_RPC_EXTENTS, + UNIFYFS_SERVER_BCAST_RPC_FILEATTR, + UNIFYFS_SERVER_BCAST_RPC_LAMINATE, + UNIFYFS_SERVER_BCAST_RPC_TRUNCATE, + UNIFYFS_SERVER_BCAST_RPC_UNLINK +} server_rpc_e; + /*---- Server Point-to-Point (p2p) RPCs ----*/ /* Report server pid to rank 0 */ @@ -48,6 +66,7 @@ MERCURY_GEN_PROC(chunk_read_request_in_t, ((int32_t)(client_id)) ((int32_t)(req_id)) ((int32_t)(num_chks)) + ((hg_size_t)(total_data_size)) ((hg_size_t)(bulk_size)) ((hg_bulk_t)(bulk_handle))) MERCURY_GEN_PROC(chunk_read_request_out_t, @@ -131,6 +150,13 @@ DECLARE_MARGO_RPC_HANDLER(truncate_rpc) /*---- Collective RPCs ----*/ +/* Finish an ongoing broadcast rpc */ +MERCURY_GEN_PROC(bcast_progress_in_t, + ((hg_ptr_t)(coll_req))) +MERCURY_GEN_PROC(bcast_progress_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(bcast_progress_rpc) + /* Broadcast file extents to all servers */ MERCURY_GEN_PROC(extent_bcast_in_t, ((int32_t)(root)) diff --git a/examples/src/testutil.h b/examples/src/testutil.h index b70bb6eb6..5ff920706 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -1124,6 +1124,9 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) } cfg->fp = fp; } else { + if (cfg->io_pattern == IO_PATTERN_NN) { + create_flags |= O_EXCL; + } fd = open(filepath, access | create_flags, create_mode); if (-1 == fd) { test_print(cfg, "ERROR: open(%s, CREAT) failed", filepath); diff --git a/examples/src/writeread.c b/examples/src/writeread.c index 1ea2a0be3..369c2eb0d 100644 --- a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -220,15 +220,17 @@ int main(int argc, char* argv[]) // create file target_file = test_target_filename(cfg); - test_print_verbose_once(cfg, "DEBUG: creating target file %s", - target_file); + test_print_verbose_once(cfg, + "DEBUG: creating target file %s", target_file); timer_start_barrier(cfg, &time_create); rc = test_create_file(cfg, target_file, O_RDWR); if (rc) { test_abort(cfg, rc); } timer_stop_barrier(cfg, &time_create); - test_print_verbose_once(cfg, "DEBUG: finished create"); + test_print_verbose_once(cfg, + "DEBUG: finished create (elapsed=%.6lf sec)", + time_create.elapsed_sec_all); if (cfg->pre_wr_trunc) { write_truncate(cfg); @@ -257,7 +259,9 @@ int main(int argc, char* argv[]) test_abort(cfg, rc); } timer_stop_barrier(cfg, &time_wr); - test_print_verbose_once(cfg, "DEBUG: finished write requests"); + test_print_verbose_once(cfg, + "DEBUG: finished write requests (elapsed=%.6lf sec)", + time_wr.elapsed_sec_all); // sync timer_start_barrier(cfg, &time_sync); @@ -266,13 +270,17 @@ int main(int argc, char* argv[]) test_abort(cfg, rc); } timer_stop_barrier(cfg, &time_sync); - test_print_verbose_once(cfg, "DEBUG: finished sync"); + test_print_verbose_once(cfg, + "DEBUG: finished sync (elapsed=%.6lf sec)", + time_sync.elapsed_sec_all); // stat file pre-laminate timer_start_barrier(cfg, &time_stat_pre); stat_file(cfg, target_file); timer_stop_barrier(cfg, &time_stat_pre); - test_print_verbose_once(cfg, "DEBUG: finished stat pre-laminate"); + test_print_verbose_once(cfg, + "DEBUG: finished stat pre-laminate (elapsed=%.6lf sec)", + time_stat_pre.elapsed_sec_all); if (cfg->post_wr_trunc) { write_truncate(cfg); @@ -281,7 +289,9 @@ int main(int argc, char* argv[]) timer_start_barrier(cfg, &time_stat_pre2); stat_file(cfg, target_file); timer_stop_barrier(cfg, &time_stat_pre2); - test_print_verbose_once(cfg, "DEBUG: finished stat pre2 (post trunc)"); + test_print_verbose_once(cfg, + "DEBUG: finished stat pre2 (post trunc, elapsed=%.6lf sec)", + 
time_stat_pre2.elapsed_sec_all); } // laminate @@ -291,13 +301,17 @@ int main(int argc, char* argv[]) test_abort(cfg, rc); } timer_stop_barrier(cfg, &time_laminate); - test_print_verbose_once(cfg, "DEBUG: finished laminate"); + test_print_verbose_once(cfg, + "DEBUG: finished laminate (elapsed=%.6lf sec)", + time_laminate.elapsed_sec_all); // stat file post-laminate timer_start_barrier(cfg, &time_stat_post); stat_cmd(cfg, target_file); timer_stop_barrier(cfg, &time_stat_post); - test_print_verbose_once(cfg, "DEBUG: finished stat post-laminate"); + test_print_verbose_once(cfg, + "DEBUG: finished stat post-laminate (elapsed=%.6lf sec)", + time_stat_post.elapsed_sec_all); // post-write cleanup free(wr_buf); @@ -327,7 +341,9 @@ int main(int argc, char* argv[]) test_abort(cfg, rc); } timer_stop_barrier(cfg, &time_rd); - test_print_verbose_once(cfg, "DEBUG: finished read requests"); + test_print_verbose_once(cfg, + "DEBUG: finished read requests (elapsed=%.6lf sec)", + time_rd.elapsed_sec_all); if (test_config.io_check) { test_print_verbose_once(cfg, "DEBUG: verifying data"); @@ -404,7 +420,7 @@ int main(int argc, char* argv[]) test_print_once(cfg, "Stat Time Pre-Laminate is %.6lf s", time_stat_pre.elapsed_sec_all); test_print_once(cfg, "Stat Time Pre-Laminate2 is %.6lf s", - time_stat_pre.elapsed_sec_all); + time_stat_pre2.elapsed_sec_all); test_print_once(cfg, "File Laminate Time is %.6lf s", time_laminate.elapsed_sec_all); test_print_once(cfg, "Stat Time Post-Laminate is %.6lf s", diff --git a/server/src/Makefile.am b/server/src/Makefile.am index 3704d2e3e..9e74de61e 100644 --- a/server/src/Makefile.am +++ b/server/src/Makefile.am @@ -14,7 +14,7 @@ unifyfsd_SOURCES = \ extent_tree.h \ margo_server.c \ margo_server.h \ - unifyfs_cmd_handler.c \ + unifyfs_client_rpc.c \ unifyfs_fops.h \ unifyfs_global.h \ unifyfs_group_rpc.h \ diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 070bae3c7..86450bce9 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -129,10 +129,10 @@ static margo_instance_id setup_remote_target(void) /* register server-server RPCs */ static void register_server_server_rpcs(margo_instance_id mid) { - unifyfsd_rpc_context->rpcs.server_pid_id = - MARGO_REGISTER(mid, "server_pid_rpc", - server_pid_in_t, server_pid_out_t, - server_pid_rpc); + unifyfsd_rpc_context->rpcs.bcast_progress_id = + MARGO_REGISTER(mid, "bcast_progress_rpc", + bcast_progress_in_t, bcast_progress_out_t, + bcast_progress_rpc); unifyfsd_rpc_context->rpcs.chunk_read_request_id = MARGO_REGISTER(mid, "chunk_read_request_rpc", @@ -189,6 +189,11 @@ static void register_server_server_rpcs(margo_instance_id mid) metaset_in_t, metaset_out_t, metaset_rpc); + unifyfsd_rpc_context->rpcs.server_pid_id = + MARGO_REGISTER(mid, "server_pid_rpc", + server_pid_in_t, server_pid_out_t, + server_pid_rpc); + unifyfsd_rpc_context->rpcs.truncate_id = MARGO_REGISTER(mid, "truncate_rpc", truncate_in_t, truncate_out_t, @@ -405,8 +410,10 @@ int margo_server_rpc_finalize(void) } /* shut down margo */ + LOGDBG("finalizing server-server margo"); margo_finalize(ctx->svr_mid); /* NOTE: 2nd call to margo_finalize() sometimes crashes - Margo bug? 
*/ + LOGDBG("finalizing client-server margo"); margo_finalize(ctx->shm_mid); /* free memory allocated for context structure */ @@ -416,6 +423,32 @@ int margo_server_rpc_finalize(void) return rc; } +int margo_connect_server(int rank) +{ + assert(rank < glb_num_servers); + + int ret = UNIFYFS_SUCCESS; + char* margo_addr_str = rpc_lookup_remote_server_addr(rank); + if (NULL == margo_addr_str) { + LOGERR("server index=%d - margo server lookup failed", rank); + ret = UNIFYFS_ERROR_KEYVAL; + return ret; + } + glb_servers[rank].margo_svr_addr_str = margo_addr_str; + LOGDBG("server rank=%d, margo_addr=%s", rank, margo_addr_str); + + hg_return_t hret = margo_addr_lookup(unifyfsd_rpc_context->svr_mid, + glb_servers[rank].margo_svr_addr_str, + &(glb_servers[rank].margo_svr_addr)); + if (hret != HG_SUCCESS) { + LOGERR("server index=%zu - margo_addr_lookup(%s) failed", + rank, margo_addr_str); + ret = UNIFYFS_ERROR_MARGO; + } + + return ret; +} + /* margo_connect_servers * * Using address strings found in glb_servers, resolve @@ -424,54 +457,25 @@ int margo_server_rpc_finalize(void) int margo_connect_servers(void) { int rc; - int ret = (int)UNIFYFS_SUCCESS; - size_t i; - hg_return_t hret; + int ret = UNIFYFS_SUCCESS; + int i; // block until a margo_svr key pair published by all servers rc = unifyfs_keyval_fence_remote(); if ((int)UNIFYFS_SUCCESS != rc) { LOGERR("keyval fence on margo_svr key failed"); - ret = (int)UNIFYFS_FAILURE; + ret = UNIFYFS_ERROR_KEYVAL; return ret; } - for (i = 0; i < glb_num_servers; i++) { - int remote_pmi_rank = -1; - char* pmi_rank_str = NULL; - char* margo_addr_str = NULL; - - rc = unifyfs_keyval_lookup_remote(i, key_unifyfsd_pmi_rank, - &pmi_rank_str); - if ((int)UNIFYFS_SUCCESS != rc) { - LOGERR("server index=%zu - pmi rank lookup failed", i); - ret = (int)UNIFYFS_FAILURE; - return ret; - } - if (NULL != pmi_rank_str) { - remote_pmi_rank = atoi(pmi_rank_str); - free(pmi_rank_str); - } - glb_servers[i].pmi_rank = remote_pmi_rank; - - margo_addr_str = rpc_lookup_remote_server_addr(i); - if (NULL == margo_addr_str) { - LOGERR("server index=%zu - margo server lookup failed", i); - ret = (int)UNIFYFS_FAILURE; - return ret; - } + for (i = 0; i < (int)glb_num_servers; i++) { + glb_servers[i].pmi_rank = i; glb_servers[i].margo_svr_addr = HG_ADDR_NULL; - glb_servers[i].margo_svr_addr_str = margo_addr_str; - LOGDBG("server index=%zu, pmi_rank=%d, margo_addr=%s", - i, remote_pmi_rank, margo_addr_str); + glb_servers[i].margo_svr_addr_str = NULL; if (!margo_lazy_connect) { - hret = margo_addr_lookup(unifyfsd_rpc_context->svr_mid, - glb_servers[i].margo_svr_addr_str, - &(glb_servers[i].margo_svr_addr)); - if (hret != HG_SUCCESS) { - LOGERR("server index=%zu - margo_addr_lookup(%s) failed", - i, margo_addr_str); - ret = (int)UNIFYFS_FAILURE; + rc = margo_connect_server(i); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; } } } @@ -479,6 +483,96 @@ int margo_connect_servers(void) return ret; } +hg_addr_t get_margo_server_address(int rank) +{ + assert(rank < glb_num_servers); + hg_addr_t addr = glb_servers[rank].margo_svr_addr; + if ((HG_ADDR_NULL == addr) && margo_lazy_connect) { + int rc = margo_connect_server(rank); + if (rc == UNIFYFS_SUCCESS) { + addr = glb_servers[rank].margo_svr_addr; + } + } + return addr; +} + +/* Use passed bulk handle to pull data into a newly allocated buffer. + * If local_bulk is not NULL, will set to local bulk handle on success. + * Returns bulk buffer, or NULL on failure. 
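
With margo.lazy_connect enabled, get_margo_server_address() above resolves and caches a peer's Margo address on first use instead of during bootstrap, so callers must treat HG_ADDR_NULL as a lookup failure. A hedged sketch of that calling pattern, mirroring how the broadcast code later in this patch obtains child handles; the helper name is hypothetical and assumes server code that already includes margo_server.h:

    /* create an RPC handle to a peer server, resolving its address lazily */
    static int get_peer_handle(hg_id_t rpc_hgid, int peer_rank,
                               hg_handle_t* hdl)
    {
        hg_addr_t addr = get_margo_server_address(peer_rank);
        if (HG_ADDR_NULL == addr) {
            LOGERR("missing margo address for rank=%d", peer_rank);
            return UNIFYFS_ERROR_MARGO;
        }

        hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr,
                                        rpc_hgid, hdl);
        if (hret != HG_SUCCESS) {
            LOGERR("failed to create handle for rpc to rank=%d", peer_rank);
            return UNIFYFS_ERROR_MARGO;
        }
        return UNIFYFS_SUCCESS;
    }
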
*/ +void* pull_margo_bulk_buffer(hg_handle_t rpc_hdl, + hg_bulk_t bulk_remote, + hg_size_t bulk_sz, + hg_bulk_t* local_bulk) +{ + if (0 == bulk_sz) { + return NULL; + } + + size_t sz = (size_t) bulk_sz; + void* buffer = malloc(sz); + if (NULL == buffer) { + LOGERR("failed to allocate buffer(sz=%zu) for bulk transfer", sz); + return NULL; + } + + /* get mercury info to set up bulk transfer */ + const struct hg_info* hgi = margo_get_info(rpc_hdl); + assert(hgi); + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + /* register local target buffer for bulk access */ + hg_bulk_t bulk_local; + hg_return_t hret = margo_bulk_create(mid, 1, &buffer, &bulk_sz, + HG_BULK_READWRITE, &bulk_local); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + free(buffer); + return NULL; + } + + /* execute the transfer to pull data from remote side + * into our local buffer. + * + * NOTE: mercury/margo bulk transfer does not check the maximum + * transfer size that the underlying transport supports, and a + * large bulk transfer may result in failure. */ + int i = 0; + hg_size_t remain = bulk_sz; + do { + hg_size_t offset = i * MAX_BULK_TX_SIZE; + hg_size_t len = remain < MAX_BULK_TX_SIZE ? remain : MAX_BULK_TX_SIZE; + hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, + bulk_remote, offset, + bulk_local, offset, len); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_transfer(buf_offset=%zu, len=%zu) failed", + (size_t)offset, (size_t)len); + break; + } + remain -= len; + i++; + } while (remain > 0); + + if (hret == HG_SUCCESS) { + LOGDBG("successful bulk transfer (%zu bytes)", bulk_sz); + if (local_bulk != NULL) { + *local_bulk = bulk_local; + } else { + /* deregister our bulk transfer buffer */ + margo_bulk_free(bulk_local); + } + return buffer; + } else { + LOGERR("failed bulk transfer - transferred %zu of %zu bytes", + (bulk_sz - remain), bulk_sz); + free(buffer); + return NULL; + } +} + +/* MARGO CLIENT-SERVER RPC INVOCATION FUNCTIONS */ + /* create and return a margo handle for given rpc id and app-client */ static hg_handle_t create_client_handle(hg_id_t id, int app_id, diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 1e0f4e600..151fb641d 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -31,6 +31,7 @@ typedef struct ServerRpcIds { /* server-server rpcs */ + hg_id_t bcast_progress_id; hg_id_t chunk_read_request_id; hg_id_t chunk_read_response_id; hg_id_t extent_add_id; @@ -66,8 +67,18 @@ extern bool margo_lazy_connect; int margo_server_rpc_init(void); int margo_server_rpc_finalize(void); +int margo_connect_server(int rank); int margo_connect_servers(void); +hg_addr_t get_margo_server_address(int rank); + +/* use passed bulk handle to pull data into a newly allocated buffer. + * returns buffer, or NULL on failure. 
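
The helper declared just below centralizes the chunked HG_BULK_PULL loop so individual RPC handlers no longer create and drive their own bulk transfers. A minimal handler-side sketch follows, based on the mread handler later in this patch; the wrapper function name is hypothetical, and the input struct type name is assumed to match the unifyfs_mread_rpc handler's convention.

    /* sketch: pull the client's extent list into a heap buffer that the
     * request manager will process and eventually free() */
    static int pull_mread_extents(hg_handle_t handle, unifyfs_mread_in_t* in,
                                  void** out_buf)
    {
        hg_size_t size = in->bulk_size;
        void* buffer = pull_margo_bulk_buffer(handle, in->bulk_extents,
                                              size, NULL);
        if (NULL == buffer) {
            return UNIFYFS_ERROR_MARGO;  /* allocation or transfer failed */
        }
        *out_buf = buffer;
        return UNIFYFS_SUCCESS;
    }
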
*/ +void* pull_margo_bulk_buffer(hg_handle_t rpc_hdl, + hg_bulk_t bulk_in, + hg_size_t bulk_sz, + hg_bulk_t* local_bulk); + /* invokes the client mread request data response rpc function */ int invoke_client_mread_req_data_rpc(int app_id, int client_id, diff --git a/server/src/unifyfs_cmd_handler.c b/server/src/unifyfs_client_rpc.c similarity index 87% rename from server/src/unifyfs_cmd_handler.c rename to server/src/unifyfs_client_rpc.c index 2bb61128b..f8c8f9c0f 100644 --- a/server/src/unifyfs_cmd_handler.c +++ b/server/src/unifyfs_client_rpc.c @@ -43,6 +43,68 @@ #include "unifyfs_misc.h" +static void create_mountpoint_dir(int app_id, + int client_id, + const char* mountpoint) +{ + /* initialize an empty file attributes structure */ + unifyfs_file_attr_t fattr; + unifyfs_file_attr_set_invalid(&fattr); + + /* set global file id and path */ + fattr.gfid = app_id; + fattr.filename = strdup(mountpoint); + + /* set initial directory state */ + fattr.mode = UNIFYFS_STAT_DEFAULT_DIR_MODE; + fattr.is_shared = 1; + fattr.is_laminated = 0; + fattr.size = 0; + + /* use current time for atime/mtime/ctime */ + struct timespec tp = {0}; + clock_gettime(CLOCK_REALTIME, &tp); + fattr.atime = tp; + fattr.mtime = tp; + fattr.ctime = tp; + + /* capture current uid and gid */ + fattr.uid = getuid(); + fattr.gid = getgid(); + + LOGDBG("creating global file metadata for mountpoint:"); + debug_print_file_attr(&fattr); + + /* create metaset request */ + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + unifyfs_metaset_in_t* in = malloc(sizeof(*in)); + if ((NULL == req) || (NULL == in)) { + LOGERR("memory allocation failed"); + return; + } + + req->req_type = UNIFYFS_CLIENT_RPC_METASET; + req->handle = HG_HANDLE_NULL; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + + in->app_id = app_id; + in->client_id = client_id; + in->attr = fattr; + in->attr_op = UNIFYFS_FILE_ATTR_OP_CREATE; + + unifyfs_fops_ctx_t ctx = { + .app_id = app_id, + .client_id = client_id, + }; + + int ret = rm_submit_client_rpc_request(&ctx, req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit metaset request for mountpoint"); + } +} + /* BEGIN MARGO CLIENT-SERVER RPC HANDLER FUNCTIONS */ /* called by client to register with the server, client provides a @@ -61,6 +123,7 @@ static void unifyfs_mount_rpc(hg_handle_t handle) int ret = (int)UNIFYFS_SUCCESS; int app_id = -1; int client_id = -1; + int create_mountpoint = 0; /* get input params */ unifyfs_mount_in_t in; @@ -77,7 +140,7 @@ static void unifyfs_mount_rpc(hg_handle_t handle) if (app_cfg == NULL) { /* insert new app_config into our app_configs array */ LOGDBG("creating new application for app_id=%d", app_id); - app_cfg = new_application(app_id); + app_cfg = new_application(app_id, &create_mountpoint); if (NULL == app_cfg) { ret = UNIFYFS_FAILURE; } @@ -98,6 +161,9 @@ static void unifyfs_mount_rpc(hg_handle_t handle) client_id = client->client_id; LOGDBG("created new application client %d:%d", app_id, client_id); + if (create_mountpoint) { + create_mountpoint_dir(app_id, client_id, in.mount_prefix); + } } } @@ -685,57 +751,32 @@ static void unifyfs_mread_rpc(hg_handle_t handle) } else { /* allocate buffer to hold array of read requests */ hg_size_t size = in->bulk_size; - void* buffer = malloc(size); + void* buffer = pull_margo_bulk_buffer(handle, in->bulk_extents, + size, NULL); if (NULL == buffer) { - ret = ENOMEM; + ret = UNIFYFS_ERROR_MARGO; } else { - /* get mercury info to set up bulk transfer */ - const struct hg_info* hgi = 
margo_get_info(handle); - assert(hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - /* register local target buffer for bulk access */ - hg_bulk_t bulk_handle; - hret = margo_bulk_create(mid, 1, &buffer, &size, - HG_BULK_WRITE_ONLY, &bulk_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; + client_rpc_req_t* req = malloc(sizeof(*req)); + if (NULL == req) { + ret = ENOMEM; } else { - /* get list of read requests */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in->bulk_extents, 0, - bulk_handle, 0, size); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - client_rpc_req_t* req = malloc(sizeof(*req)); - if (NULL == req) { - ret = ENOMEM; - } else { - unifyfs_fops_ctx_t ctx = { - .app_id = in->app_id, - .client_id = in->client_id - }; - req->req_type = UNIFYFS_CLIENT_RPC_READ; - req->handle = handle; - req->input = (void*) in; - req->bulk_buf = buffer; - req->bulk_sz = size; - ret = rm_submit_client_rpc_request(&ctx, req); - } - - if (ret != UNIFYFS_SUCCESS) { - free(buffer); - if (NULL != req) { - free(req); - } - margo_free_input(handle, in); - } + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id + }; + req->req_type = UNIFYFS_CLIENT_RPC_READ; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = buffer; + req->bulk_sz = size; + ret = rm_submit_client_rpc_request(&ctx, req); + } + if (ret != UNIFYFS_SUCCESS) { + free(buffer); + if (NULL != req) { + free(req); } - margo_bulk_free(bulk_handle); + margo_free_input(handle, in); } } } diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c index 9eb0e1811..c0f0c3a47 100644 --- a/server/src/unifyfs_fops_rpc.c +++ b/server/src/unifyfs_fops_rpc.c @@ -106,7 +106,7 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, unifyfs_index_t* meta_payload = (unifyfs_index_t*)(ptr_extents); struct extent_tree_node* extents = calloc(num_extents, sizeof(*extents)); - if (!extents) { + if (NULL == extents) { LOGERR("failed to allocate memory for local_extents"); return ENOMEM; } @@ -139,6 +139,8 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, LOGERR("failed to add extents (gfid=%d, ret=%d)", gfid, ret); } + free(extents); + return ret; } @@ -169,6 +171,10 @@ static int rpc_unlink(unifyfs_fops_ctx_t* ctx, int gfid) { + int ret = unifyfs_inode_unlink(gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unlink(gfid=%d) failed", gfid); + } return unifyfs_invoke_broadcast_unlink(gfid); } @@ -244,7 +250,7 @@ int submit_read_request(unifyfs_fops_ctx_t* ctx, return EINVAL; } - LOGDBG("handling read request (%u chunk requests)", count); + LOGDBG("handling read request (%u extents)", count); /* see if we have a valid app information */ int app_id = ctx->app_id; @@ -317,13 +323,12 @@ int rpc_read(unifyfs_fops_ctx_t* ctx, off_t offset, size_t length) { - unifyfs_inode_extent_t chunk = { 0, }; - - chunk.gfid = gfid; - chunk.offset = offset; - chunk.length = length; + unifyfs_inode_extent_t extent = { 0 }; + extent.gfid = gfid; + extent.offset = (unsigned long) offset; + extent.length = (unsigned long) length; - return submit_read_request(ctx, 1, &chunk); + return submit_read_request(ctx, 1, &extent); } static @@ -333,31 +338,27 @@ int rpc_mread(unifyfs_fops_ctx_t* ctx, { int ret = UNIFYFS_SUCCESS; unsigned int i = 0; - unsigned int count = (unsigned int)n_req; - unifyfs_inode_extent_t* chunks = NULL; + unsigned int count = (unsigned int) n_req; + 
unifyfs_inode_extent_t* extents = NULL; unifyfs_extent_t* reqs = (unifyfs_extent_t*) read_reqs; - chunks = calloc(n_req, sizeof(*chunks)); - if (!chunks) { + extents = calloc(n_req, sizeof(*extents)); + if (NULL == extents) { LOGERR("failed to allocate the chunk request"); return ENOMEM; } for (i = 0; i < count; i++) { - unifyfs_inode_extent_t* ch = chunks + i; + unifyfs_inode_extent_t* ext = extents + i; unifyfs_extent_t* req = reqs + i; - ch->gfid = req->gfid; - ch->offset = req->offset; - ch->length = req->length; + ext->gfid = req->gfid; + ext->offset = (unsigned long) req->offset; + ext->length = (unsigned long) req->length; } - ret = submit_read_request(ctx, count, chunks); - - if (chunks) { - free(chunks); - chunks = NULL; - } + ret = submit_read_request(ctx, count, extents); + free(extents); return ret; } diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index 4133ccb87..f0f023d4c 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -78,12 +78,6 @@ extern size_t glb_num_servers; /* number of entries in glb_servers array */ extern struct unifyfs_inode_tree* global_inode_tree; /* global inode tree */ -/* defines commands for messages sent to service manager threads */ -typedef enum { - SVC_CMD_INVALID = 0, - SVC_CMD_RDREQ_CHK, /* read requests (chunk_read_req_t) */ -} service_cmd_e; - // NEW READ REQUEST STRUCTURES typedef enum { READREQ_NULL = 0, /* request not initialized */ @@ -102,6 +96,15 @@ typedef struct { int rank; /* remote server rank who holds data */ } chunk_read_req_t; +#define debug_print_chunk_read_req(reqptr) \ +do { \ + chunk_read_req_t* _req = (reqptr); \ + LOGDBG("chunk_read_req(%p) - gfid=%d, offset=%zu, nbytes=%zu @ " \ + "server[%d] log(app=%d, client=%d, offset=%zu)", \ + _req, _req->gfid, _req->offset, _req->nbytes, _req->rank, \ + _req->log_app_id, _req->log_client_id, _req->log_offset); \ +} while (0) + typedef struct { int gfid; /* gfid */ size_t offset; /* file offset */ @@ -166,7 +169,8 @@ typedef struct app_config { app_config* get_application(int app_id); -app_config* new_application(int app_id); +app_config* new_application(int app_id, + int* created); unifyfs_rc cleanup_application(app_config* app); @@ -189,4 +193,11 @@ unifyfs_rc disconnect_app_client(app_client* clnt); unifyfs_rc cleanup_app_client(app_config* app, app_client* clnt); + +/* publish the pids of all servers to a shared file */ +int unifyfs_publish_server_pids(void); + +/* report the pid for a server with given rank */ +int unifyfs_report_server_pid(int rank, int pid); + #endif // UNIFYFS_GLOBAL_H diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index 2a9814ef6..fdc72f98e 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -12,354 +12,622 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ -#include "unifyfs_global.h" -#include "unifyfs_tree.h" -#include "margo_server.h" -#include "unifyfs_server_rpcs.h" #include "unifyfs_group_rpc.h" + #ifndef UNIFYFS_BCAST_K_ARY # define UNIFYFS_BCAST_K_ARY 2 #endif -/* server collective (coll) margo request structure */ -typedef struct { - margo_request request; - hg_handle_t handle; -} coll_request; /* helper method to initialize collective request rpc handle for child peer */ -static int get_request_handle(hg_id_t request_hgid, - int peer_rank, - coll_request* creq) +static int get_child_request_handle(hg_id_t request_hgid, + int peer_rank, + hg_handle_t* chdl) { - int rc = UNIFYFS_SUCCESS; + int ret = UNIFYFS_SUCCESS; /* get address for specified server rank */ - hg_addr_t addr = glb_servers[peer_rank].margo_svr_addr; - - /* get handle to rpc function */ - hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, - request_hgid, &(creq->handle)); - if (hret != HG_SUCCESS) { - LOGERR("failed to get handle for request(%p) to server %d", - creq, peer_rank); - rc = UNIFYFS_ERROR_MARGO; + hg_addr_t addr = get_margo_server_address(peer_rank); + if (HG_ADDR_NULL == addr) { + LOGERR("missing margo address for rank=%d", peer_rank); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* get handle to rpc function */ + hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, + request_hgid, chdl); + if (hret != HG_SUCCESS) { + LOGERR("failed to get handle for child request to server %d", + peer_rank); + ret = UNIFYFS_ERROR_MARGO; + } } - return rc; + return ret; } /* helper method to forward collective rpc request to one child */ -static int forward_request(void* input_ptr, - coll_request* creq) +static int forward_child_request(void* input_ptr, + hg_handle_t chdl, + margo_request* creq) { - int rc = UNIFYFS_SUCCESS; + int ret = UNIFYFS_SUCCESS; /* call rpc function */ - hg_return_t hret = margo_iforward(creq->handle, input_ptr, - &(creq->request)); + hg_return_t hret = margo_iforward(chdl, input_ptr, creq); if (hret != HG_SUCCESS) { LOGERR("failed to forward request(%p)", creq); - rc = UNIFYFS_ERROR_MARGO; + ret = UNIFYFS_ERROR_MARGO; } - return rc; + return ret; } /* helper method to wait for collective rpc child request completion */ -static int wait_for_request(coll_request* creq) +static int wait_for_child_request(margo_request* creq) { - int rc = UNIFYFS_SUCCESS; + int ret = UNIFYFS_SUCCESS; /* call rpc function */ - hg_return_t hret = margo_wait(creq->request); + hg_return_t hret = margo_wait(*creq); if (hret != HG_SUCCESS) { LOGERR("wait on request(%p) failed", creq); - rc = UNIFYFS_ERROR_MARGO; + ret = UNIFYFS_ERROR_MARGO; } - return rc; + return ret; } -/************************************************************************* - * Broadcast file extents metadata - *************************************************************************/ +static coll_request* collective_create(server_rpc_e req_type, + hg_handle_t handle, + hg_id_t op_hgid, + int tree_root_rank, + void* input_struct, + void* output_struct, + size_t output_size, + hg_bulk_t bulk_in, + hg_bulk_t bulk_forward, + void* bulk_buf) +{ + coll_request* coll_req = calloc(1, sizeof(*coll_req)); + if (NULL != coll_req) { + LOGDBG("BCAST_RPC: collective(%p) create (type=%d)", + coll_req, req_type); + coll_req->req_type = req_type; + coll_req->resp_hdl = handle; + coll_req->input = input_struct; + coll_req->output = output_struct; + coll_req->output_sz = output_size; + coll_req->bulk_in = bulk_in; + coll_req->bulk_forward = bulk_forward; + coll_req->bulk_buf = 
bulk_buf; + + unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, tree_root_rank, + UNIFYFS_BCAST_K_ARY, &(coll_req->tree)); + + size_t n_children = (size_t) coll_req->tree.child_count; + if (n_children) { + coll_req->child_hdls = calloc(n_children, sizeof(hg_handle_t)); + coll_req->child_reqs = calloc(n_children, sizeof(margo_request)); + if ((NULL == coll_req->child_hdls) || + (NULL == coll_req->child_reqs)) { + LOGERR("allocation of children state failed"); + free(coll_req); + return NULL; + } + int* ranks = coll_req->tree.child_ranks; + for (int i = 0; i < coll_req->tree.child_count; i++) { + /* allocate child request handle */ + hg_handle_t* chdl = coll_req->child_hdls + i; + int rc = get_child_request_handle(op_hgid, ranks[i], chdl); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to get child request handle"); + *chdl = HG_HANDLE_NULL; + } + } + } + } + return coll_req; +} -/* file extents metadata broadcast rpc handler */ -static void extent_bcast_rpc(hg_handle_t handle) +/* reset collective input bulk handle to original value */ +static void coll_restore_input_bulk(coll_request* coll_req) { - LOGDBG("MARGOTREE: extent bcast handler"); + void* input = coll_req->input; + if ((NULL == input) || (HG_BULK_NULL == coll_req->bulk_in) + || (HG_BULK_NULL == coll_req->bulk_forward)) { + return; + } - /* assume we'll succeed */ - int32_t ret = UNIFYFS_SUCCESS; + /* update input structure bulk handle using stored value */ + switch (coll_req->req_type) { + case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: { + extent_bcast_in_t* ebi = (extent_bcast_in_t*) input; + ebi->extents = coll_req->bulk_in; + break; + } + case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: { + laminate_bcast_in_t* lbi = (laminate_bcast_in_t*) input; + lbi->extents = coll_req->bulk_in; + break; + } + default: + LOGERR("invalid collective request type %d", coll_req->req_type); + break; + } +} - /* get instance id */ - margo_instance_id mid = margo_hg_handle_get_instance(handle); +static void collective_cleanup(coll_request* coll_req) +{ + if (NULL == coll_req) { + return; + } - /* get input params */ - extent_bcast_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + LOGDBG("BCAST_RPC: collective(%p) cleanup", coll_req); + + /* release communication tree resources */ + unifyfs_tree_free(&(coll_req->tree)); + + /* release margo resources */ + if (HG_HANDLE_NULL != coll_req->resp_hdl) { + if (NULL != coll_req->input) { + coll_restore_input_bulk(coll_req); + margo_free_input(coll_req->resp_hdl, coll_req->input); + } + margo_destroy(coll_req->resp_hdl); + } + if (HG_BULK_NULL != coll_req->bulk_forward) { + margo_bulk_free(coll_req->bulk_forward); + } + + /* free allocated memory */ + if (NULL != coll_req->input) { + free(coll_req->input); + } + if (NULL != coll_req->output) { + free(coll_req->output); + } + if (NULL != coll_req->child_hdls) { + free(coll_req->child_hdls); + } + if (NULL != coll_req->child_reqs) { + free(coll_req->child_reqs); + } + if (NULL != coll_req->bulk_buf) { + free(coll_req->bulk_buf); + } + free(coll_req); +} + +/* Forward the collective request to any children */ +static int collective_forward(coll_request* coll_req) +{ + /* get info for tree */ + int child_count = coll_req->tree.child_count; + if (0 == child_count) { + return UNIFYFS_SUCCESS; + } + + LOGDBG("BCAST_RPC: collective(%p) forward", coll_req); + + /* forward request down the tree */ + int ret = UNIFYFS_SUCCESS; + for (int i = 0; i < child_count; i++) { + /* invoke 
bcast request rpc on child */ + margo_request* creq = coll_req->child_reqs + i; + hg_handle_t* chdl = coll_req->child_hdls + i; + int rc = forward_child_request(coll_req->input, *chdl, creq); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward to child[%d] failed", i); + ret = rc; + } + } + + return ret; +} + + + +/* set collective output return value to local result value */ +void collective_set_local_retval(coll_request* coll_req, int val) +{ + /* update collective return value using local op return value */ + void* output = coll_req->output; + if (NULL == output) { + return; + } + + switch (coll_req->req_type) { + case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: { + extent_bcast_out_t* ebo = (extent_bcast_out_t*) output; + ebo->ret = val; + break; + } + case UNIFYFS_SERVER_BCAST_RPC_FILEATTR: { + fileattr_bcast_out_t* fbo = (fileattr_bcast_out_t*) output; + fbo->ret = val; + break; + } + case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: { + laminate_bcast_out_t* lbo = (laminate_bcast_out_t*) output; + lbo->ret = val; + break; + } + case UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { + truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; + tbo->ret = val; + break; + } + case UNIFYFS_SERVER_BCAST_RPC_UNLINK: { + unlink_bcast_out_t* ubo = (unlink_bcast_out_t*) output; + ubo->ret = val; + break; + } + default: + LOGERR("invalid collective request type %d", coll_req->req_type); + break; + } +} + +static int coll_get_child_response(coll_request* coll_req, + hg_handle_t chdl) +{ + int ret = UNIFYFS_SUCCESS; + void* out = calloc(1, coll_req->output_sz); + if (NULL == out) { + ret = ENOMEM; } else { - /* get root of tree and global file id to lookup filesize - * record tag calling process wants us to include in our - * later response */ - int gfid = (int) in.gfid; - int32_t num_extents = (int32_t) in.num_extents; - - /* allocate memory for extents */ - struct extent_tree_node* extents; - extents = calloc(num_extents, sizeof(struct extent_tree_node)); - - /* get client address */ - const struct hg_info* info = margo_get_info(handle); - hg_addr_t client_address = info->addr; - - /* expose local bulk buffer */ - hg_size_t buf_size = num_extents * sizeof(struct extent_tree_node); - hg_bulk_t extent_data; - void* datap = extents; - hret = margo_bulk_create(mid, 1, &datap, &buf_size, - HG_BULK_READWRITE, &extent_data); + hg_return_t hret = margo_get_output(chdl, out); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_get_output() failed"); ret = UNIFYFS_ERROR_MARGO; } else { - int i, rc; - hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; - - /* create communication tree structure */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - /* initiate data transfer */ - margo_request bulk_request; - hret = margo_bulk_itransfer(mid, HG_BULK_PULL, client_address, - in.extents, 0, - extent_data, 0, - buf_size, - &bulk_request); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_itransfer() failed"); - ret = UNIFYFS_ERROR_MARGO; + /* update collective return value using child response */ + int child_ret = UNIFYFS_SUCCESS; + void* output = coll_req->output; + + switch (coll_req->req_type) { + case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: { + extent_bcast_out_t* cebo = (extent_bcast_out_t*) out; + extent_bcast_out_t* ebo = (extent_bcast_out_t*) output; + child_ret = cebo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + ebo->ret = child_ret; + } + break; } - - /* update input structure to point to local bulk handle */ - in.extents = 
extent_data; - - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = - calloc(bcast_tree.child_count, sizeof(*requests)); - if (NULL == requests) { - ret = ENOMEM; - } else { - /* allocate mercury handles for forwarding the request */ - for (i = 0; i < bcast_tree.child_count; i++) { - /* allocate handle for request to this child */ - int child = bcast_tree.child_ranks[i]; - get_request_handle(req_hgid, child, requests+i); + case UNIFYFS_SERVER_BCAST_RPC_FILEATTR: { + fileattr_bcast_out_t* cfbo = (fileattr_bcast_out_t*) out; + fileattr_bcast_out_t* fbo = (fileattr_bcast_out_t*) output; + child_ret = cfbo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + fbo->ret = child_ret; } + break; } - - /* wait for data transfer to finish */ - hret = margo_wait(bulk_request); - if (hret != HG_SUCCESS) { - LOGERR("margo_wait() for bulk transfer failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - LOGDBG("received %d extents (%zu bytes) from %d", - num_extents, (size_t)buf_size, (int)in.root); - - if (NULL != requests) { - /* forward request down the tree */ - for (i = 0; i < bcast_tree.child_count; i++) { - /* invoke filesize request rpc on child */ - rc = forward_request((void*)&in, requests+i); - } + case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: { + laminate_bcast_out_t* clbo = (laminate_bcast_out_t*) out; + laminate_bcast_out_t* lbo = (laminate_bcast_out_t*) output; + child_ret = clbo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + lbo->ret = child_ret; } - - ret = unifyfs_inode_add_extents(gfid, num_extents, extents); - if (ret) { - LOGERR("add of remote extents failed (ret=%d)", ret); - // what do we do now? + break; + } + case UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { + truncate_bcast_out_t* ctbo = (truncate_bcast_out_t*) out; + truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; + child_ret = ctbo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + tbo->ret = child_ret; } - LOGDBG("added %d extents (%zu bytes) from %d", - num_extents, (size_t)buf_size, (int)in.root); - - if (NULL != requests) { - /* wait for the requests to finish */ - coll_request* req; - for (i = 0; i < bcast_tree.child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - extent_bcast_out_t out; - hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = (int) out.ret; - LOGDBG("MARGOTREE: extbcast child[%d] " - "response: %d", i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; - } - margo_free_output(req->handle, &out); - } - margo_destroy(req->handle); - } else { - ret = rc; - } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_UNLINK: { + unlink_bcast_out_t* cubo = (unlink_bcast_out_t*) out; + unlink_bcast_out_t* ubo = (unlink_bcast_out_t*) output; + child_ret = cubo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + ubo->ret = child_ret; + } + break; + } + default: + child_ret = UNIFYFS_FAILURE; + LOGERR("invalid collective request type %d", + coll_req->req_type); + break; + } + + ret = child_ret; + + margo_free_output(chdl, out); + } + } + + return ret; +} + +/* Forward the collective request to any children */ +static int collective_finish(coll_request* coll_req) +{ + int ret = UNIFYFS_SUCCESS; + + /* get info for tree */ + int child_count = coll_req->tree.child_count; + + LOGDBG("BCAST_RPC: collective(%p) finish", coll_req); + + if (child_count) { + /* 
wait for child requests to finish */ + int i, rc; + if (NULL != coll_req->child_reqs) { + margo_request* creq; + hg_handle_t* chdl; + /* MJB TODO - use margo_wait_any() instead of our own loop */ + for (i = 0; i < child_count; i++) { + chdl = coll_req->child_hdls + i; + creq = coll_req->child_reqs + i; + rc = wait_for_child_request(creq); + if (rc == UNIFYFS_SUCCESS) { + /* get the output of the rpc */ + int child_ret = coll_get_child_response(coll_req, *chdl); + LOGDBG("BCAST_RPC: collective(%p) child[%d] resp=%d", + coll_req, i, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; } - free(requests); + } else { + ret = rc; } + margo_destroy(*chdl); } - /* free bulk data handle */ - margo_bulk_free(extent_data); + } else { + LOGERR("child count is %d, but NULL child reqs array", + child_count); + ret = UNIFYFS_FAILURE; + } + } - /* release communication tree resources */ - unifyfs_tree_free(&bcast_tree); + if (NULL != coll_req->output) { + /* send output back to caller */ + hg_return_t hret = margo_respond(coll_req->resp_hdl, coll_req->output); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); } - margo_free_input(handle, &in); + + LOGDBG("BCAST_RPC: collective(%p, op=%d) responded", + coll_req, (int)(coll_req->req_type)); } - /* build our output values */ - extent_bcast_out_t out; - out.ret = ret; + collective_cleanup(coll_req); + + return ret; +} + + +/************************************************************************* + * Broadcast progress via ULT + *************************************************************************/ + +int invoke_bcast_progress_rpc(coll_request* coll_req) +{ + int ret = UNIFYFS_SUCCESS; + + /* get address for local server rank */ + hg_addr_t addr = get_margo_server_address(glb_pmi_rank); + if (HG_ADDR_NULL == addr) { + LOGERR("missing local margo address"); + return UNIFYFS_ERROR_MARGO; + } + + /* get handle to local rpc function */ + hg_handle_t handle; + hg_id_t hgid = unifyfsd_rpc_context->rpcs.bcast_progress_id; + hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, + hgid, &handle); + if (hret != HG_SUCCESS) { + LOGERR("failed to get handle for bcast progress"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* call rpc function */ + bcast_progress_in_t in; + in.coll_req = (hg_ptr_t) coll_req; + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("failed to forward bcast progress for coll(%p)", coll_req); + ret = UNIFYFS_ERROR_MARGO; + } + } + + return ret; +} + +/* generic broadcast rpc progression handler */ +static void bcast_progress_rpc(hg_handle_t handle) +{ + /* assume we'll succeed */ + int32_t ret = UNIFYFS_SUCCESS; + + bcast_progress_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* call collective_finish() to progress bcast operation */ + coll_request* coll = (coll_request*) in.coll_req; + LOGDBG("BCAST_RPC: bcast progress collective(%p)", coll); + ret = collective_finish(coll); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("collective_finish() failed for coll_req(%p) (rc=%d)", + coll, ret); + } + } - /* send output back to caller */ + /* finish rpc */ + bcast_progress_out_t out; + out.ret = ret; hret = margo_respond(handle, &out); if (hret != HG_SUCCESS) { LOGERR("margo_respond() failed"); } - LOGDBG("MARGOTREE: extent bcast rpc handler - responded"); - /* free margo resources */ + margo_free_input(handle, &in); margo_destroy(handle); } 
-DEFINE_MARGO_RPC_HANDLER(extent_bcast_rpc) +DEFINE_MARGO_RPC_HANDLER(bcast_progress_rpc) -/* Forward the extent broadcast to all children and wait for responses */ -static -int extent_bcast_forward(const unifyfs_tree_t* broadcast_tree, - extent_bcast_in_t* in) -{ - LOGDBG("MARGOTREE: extent bcast forward"); - /* get info for tree */ - int child_count = broadcast_tree->child_count; - if (0 == child_count) { - return UNIFYFS_SUCCESS; - } +/************************************************************************* + * Broadcast file extents metadata + *************************************************************************/ - int* child_ranks = broadcast_tree->child_ranks; +/* file extents metadata broadcast rpc handler */ +static void extent_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("BCAST_RPC: extents handler"); - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = calloc(child_count, - sizeof(*requests)); + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - /* forward request down the tree */ - int i, rc, ret; - coll_request* req; - hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; - for (i = 0; i < child_count; i++) { - req = requests + i; - - /* allocate handle */ - rc = get_request_handle(req_hgid, child_ranks[i], req); - if (rc == UNIFYFS_SUCCESS) { - /* invoke extbcast request rpc on child */ - rc = forward_request((void*)in, req); + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + extent_bcast_in_t* in = calloc(1, sizeof(*in)); + extent_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; + } else { + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; } else { - ret = rc; - } - } - - /* wait for the requests to finish */ - for (i = 0; i < child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - LOGDBG("MARGOTREE: extent bcast - child[%d] responded", i); - /* get the output of the rpc */ - extent_bcast_out_t out; - hg_return_t hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + size_t num_extents = (size_t) in->num_extents; + size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); + hg_bulk_t local_bulk = HG_BULK_NULL; + void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, + bulk_sz, &local_bulk); + if (NULL == extents_buf) { + LOGERR("failed to get bulk extents"); ret = UNIFYFS_ERROR_MARGO; } else { - /* set return value */ - int child_ret = out.ret; - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_EXTENTS; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + in->extents, local_bulk, extents_buf); + if (NULL == coll) { + ret = ENOMEM; + } else { + /* update input structure that we are forwarding to point + * to our local bulk buffer. will be restore on cleanup. 
*/ + in->extents = local_bulk; + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = extents_buf; + req->bulk_sz = bulk_sz; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); + } + } } - margo_free_output(req->handle, &out); } - margo_destroy(req->handle); - } else { - ret = rc; } } - return ret; + if (ret != UNIFYFS_SUCCESS) { + /* report failure back to caller */ + extent_bcast_out_t ebo; + ebo.ret = (int32_t)ret; + hg_return_t hret = margo_respond(handle, &ebo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + if (NULL != coll) { + collective_cleanup(coll); + } else { + margo_destroy(handle); + } + } } +DEFINE_MARGO_RPC_HANDLER(extent_bcast_rpc) /* Execute broadcast tree for extent metadata */ -int unifyfs_invoke_broadcast_extents_rpc(int gfid, unsigned int len, - struct extent_tree_node* extents) +int unifyfs_invoke_broadcast_extents_rpc(int gfid) { /* assuming success */ int ret = UNIFYFS_SUCCESS; - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, - UNIFYFS_BCAST_K_ARY, &bcast_tree); + LOGDBG("BCAST_RPC: starting extents for gfid=%d", gfid); - hg_size_t num_extents = len; - hg_size_t buf_size = num_extents * sizeof(*extents); + size_t n_extents; + struct extent_tree_node* extents; + ret = unifyfs_inode_get_extents(gfid, &n_extents, &extents); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to get extents for gfid=%d", gfid); + return ret; + } - LOGDBG("broadcasting %u extents for gfid=%d)", - len, gfid); + if (0 == n_extents) { + /* nothing to broadcast */ + return UNIFYFS_SUCCESS; + } /* create bulk data structure containing the extents * NOTE: bulk data is always read only at the root of the broadcast tree */ + hg_size_t buf_size = n_extents * sizeof(*extents); hg_bulk_t extents_bulk; - void* datap = (void*) extents; + void* buf = (void*) extents; hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, - &datap, &buf_size, + &buf, &buf_size, HG_BULK_READ_ONLY, &extents_bulk); if (hret != HG_SUCCESS) { LOGERR("margo_bulk_create() failed"); ret = UNIFYFS_ERROR_MARGO; } else { - /* fill in input struct */ - extent_bcast_in_t in; - in.root = (int32_t)glb_pmi_rank; - in.gfid = gfid; - in.num_extents = num_extents; - in.extents = extents_bulk; - - extent_bcast_forward(&bcast_tree, &in); - - /* free bulk data handle */ - margo_bulk_free(extents_bulk); + coll_request* coll = NULL; + extent_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* set input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = (int32_t) gfid; + in->extents = extents_bulk; + in->num_extents = (int32_t) n_extents; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.extent_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_EXTENTS; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(extent_bcast_out_t), + HG_BULK_NULL, extents_bulk, buf); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); + } + } + } } - /* free tree resources and passed extents */ - unifyfs_tree_free(&bcast_tree); - free(extents); + if (ret != UNIFYFS_SUCCESS) { + if (NULL != extents) { + free(extents); + } + } return ret; } @@ -371,256 +639,85 @@ 
int unifyfs_invoke_broadcast_extents_rpc(int gfid, unsigned int len, /* file extents metadata broadcast rpc handler */ static void laminate_bcast_rpc(hg_handle_t handle) { - LOGDBG("MARGOTREE: laminate bcast handler"); - - int32_t ret; + LOGDBG("BCAST_RPC: laminate handler"); - /* get instance id */ - margo_instance_id mid = margo_hg_handle_get_instance(handle); + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - /* get input params */ - laminate_bcast_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + laminate_bcast_in_t* in = calloc(1, sizeof(*in)); + laminate_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; } else { - /* get root of tree and global file id to lookup filesize - * record tag calling process wants us to include in our - * later response */ - int gfid = (int) in.gfid; - size_t num_extents = (size_t) in.num_extents; - unifyfs_file_attr_t* fattr = &(in.attr); - - /* allocate memory for extents */ - struct extent_tree_node* extents; - extents = calloc(num_extents, sizeof(struct extent_tree_node)); - - /* get client address */ - const struct hg_info* info = margo_get_info(handle); - hg_addr_t client_address = info->addr; - - /* expose local bulk buffer */ - hg_size_t buf_size = num_extents * sizeof(struct extent_tree_node); - hg_bulk_t extent_data; - void* datap = extents; - hret = margo_bulk_create(mid, 1, &datap, &buf_size, - HG_BULK_READWRITE, &extent_data); + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_get_input() failed"); ret = UNIFYFS_ERROR_MARGO; } else { - int i, rc; - hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; - - /* create communication tree structure */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - /* initiate data transfer */ - margo_request bulk_request; - hret = margo_bulk_itransfer(mid, HG_BULK_PULL, - client_address, in.extents, 0, - extent_data, 0, - buf_size, &bulk_request); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_itransfer() failed"); + size_t n_extents = (size_t) in->num_extents; + size_t bulk_sz = n_extents * sizeof(struct extent_tree_node); + hg_bulk_t local_bulk = HG_BULK_NULL; + void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, + bulk_sz, &local_bulk); + if (NULL == extents_buf) { + LOGERR("failed to get bulk extents"); ret = UNIFYFS_ERROR_MARGO; - } - - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = - calloc(bcast_tree.child_count, sizeof(*requests)); - if (NULL == requests) { - ret = ENOMEM; } else { - /* allocate mercury handles for forwarding the request */ - for (i = 0; i < bcast_tree.child_count; i++) { - /* allocate handle for request to this child */ - int child = bcast_tree.child_ranks[i]; - get_request_handle(req_hgid, child, requests+i); - } - } - - /* wait for data transfer to finish */ - hret = margo_wait(bulk_request); - if (hret != HG_SUCCESS) { - LOGERR("margo_wait() for bulk transfer failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - LOGINFO("laminating gfid=%d, received %zu extents from %d", - gfid, num_extents, (int)in.root); - - if (NULL != requests) { - /* update input 
structure to point to local bulk handle */ - in.extents = extent_data; - - /* forward request down the tree */ - for (i = 0; i < bcast_tree.child_count; i++) { - /* invoke filesize request rpc on child */ - rc = forward_request((void*)&in, requests+i); - } - } - - /* update inode file attributes. first check to make sure inode - * for the gfid exists. if it doesn't, create it with given - * attrs. otherwise, just do a metadata update. */ - unifyfs_file_attr_t existing_fattr; - ret = unifyfs_inode_metaget(gfid, &existing_fattr); - if (ret == ENOENT) { - /* create with is_laminated=0 so extents can be added */ - fattr->is_laminated = 0; - ret = unifyfs_inode_create(gfid, fattr); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("inode create failed (ret=%d)", ret); - } - fattr->is_laminated = 1; - } - - /* add the final set of extents */ - ret = unifyfs_inode_add_extents(gfid, num_extents, extents); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("laminate extents update failed (ret=%d)", ret); + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_LAMINATE; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + in->extents, local_bulk, extents_buf); + if (NULL == coll) { + ret = ENOMEM; } else { - ret = unifyfs_inode_metaset(gfid, - UNIFYFS_FILE_ATTR_OP_LAMINATE, - fattr); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("laminate attrs update failed (ret=%d)", ret); - } - } - - if (NULL != requests) { - /* wait for the requests to finish */ - coll_request* req; - for (i = 0; i < bcast_tree.child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - laminate_bcast_out_t out; - hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = (int) out.ret; - LOGDBG("MARGOTREE: laminate child[%d] " - "response: %d", i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; - } - margo_free_output(req->handle, &out); - } - margo_destroy(req->handle); - } else { - ret = rc; + /* update input structure that we are forwarding to point + * to our local bulk buffer. will be restore on cleanup. 
*/ + in->extents = local_bulk; + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = extents_buf; + req->bulk_sz = bulk_sz; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); } } - free(requests); } } - /* free bulk data handle */ - margo_bulk_free(extent_data); - - /* release communication tree resources */ - unifyfs_tree_free(&bcast_tree); } - margo_free_input(handle, &in); - } - - /* build our output values */ - laminate_bcast_out_t out; - out.ret = ret; - - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } - - LOGDBG("MARGOTREE: laminate bcast handler - responded"); - - /* free margo resources */ - margo_destroy(handle); -} -DEFINE_MARGO_RPC_HANDLER(laminate_bcast_rpc) - -/* Forward the laminate broadcast to all children and wait for responses */ -static -int laminate_bcast_forward(const unifyfs_tree_t* broadcast_tree, - laminate_bcast_in_t* in) -{ - /* get info for tree */ - int* child_ranks = broadcast_tree->child_ranks; - int child_count = broadcast_tree->child_count; - if (0 == child_count) { - return UNIFYFS_SUCCESS; } - int gfid = (int) in->gfid; - LOGDBG("MARGOTREE: laminate bcast forward for gfid=%d", gfid); - - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = calloc(child_count, - sizeof(*requests)); - - /* forward request down the tree */ - int i, rc, ret; - coll_request* req; - hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; - for (i = 0; i < child_count; i++) { - req = requests + i; - - /* allocate handle */ - rc = get_request_handle(req_hgid, child_ranks[i], req); - if (rc == UNIFYFS_SUCCESS) { - /* invoke extbcast request rpc on child */ - rc = forward_request((void*)in, req); - } else { - ret = rc; + if (ret != UNIFYFS_SUCCESS) { + /* report failure back to caller */ + laminate_bcast_out_t lbo; + lbo.ret = (int32_t)ret; + hg_return_t hret = margo_respond(handle, &lbo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); } - } - /* wait for the requests to finish */ - for (i = 0; i < child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - LOGDBG("MARGOTREE: laminate bcast - child[%d] responded", i); - /* get the output of the rpc */ - laminate_bcast_out_t out; - hg_return_t hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = out.ret; - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; - } - margo_free_output(req->handle, &out); - } - margo_destroy(req->handle); + if (NULL != coll) { + collective_cleanup(coll); } else { - ret = rc; + margo_destroy(handle); } } - - return ret; } +DEFINE_MARGO_RPC_HANDLER(laminate_bcast_rpc) /* Execute broadcast tree for attributes and extent metadata due to laminate */ int unifyfs_invoke_broadcast_laminate(int gfid) { - int ret; - - LOGDBG("broadcasting laminate for gfid=%d", gfid); + /* assuming success */ + int ret = UNIFYFS_SUCCESS; /* get attributes and extents metadata */ unifyfs_file_attr_t attrs; @@ -630,6 +727,14 @@ int unifyfs_invoke_broadcast_laminate(int gfid) return ret; } + if (!attrs.is_shared) { + /* no need to broadcast for private files */ + LOGDBG("gfid=%d 
is private, not broadcasting", gfid); + return UNIFYFS_SUCCESS; + } + + LOGDBG("BCAST_RPC: starting laminate for gfid=%d", gfid); + size_t n_extents; struct extent_tree_node* extents; ret = unifyfs_inode_get_extents(gfid, &n_extents, &extents); @@ -640,214 +745,161 @@ int unifyfs_invoke_broadcast_laminate(int gfid) /* create bulk data structure containing the extents * NOTE: bulk data is always read only at the root of the broadcast tree */ - hg_size_t num_extents = n_extents; - hg_size_t buf_size = num_extents * sizeof(*extents); - hg_bulk_t extents_bulk; - void* datap = (void*) extents; - hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, - &datap, &buf_size, - HG_BULK_READ_ONLY, &extents_bulk); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* create broadcast communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - /* fill input struct and forward */ - laminate_bcast_in_t in; - in.root = (int32_t) glb_pmi_rank; - in.gfid = (int32_t) gfid; - in.attr = attrs; - in.num_extents = (int32_t) num_extents; - in.extents = extents_bulk; - laminate_bcast_forward(&bcast_tree, &in); - - /* free tree resources */ - unifyfs_tree_free(&bcast_tree); - - /* free bulk data handle */ - margo_bulk_free(extents_bulk); - } - - /* free extents array */ - free(extents); - - return ret; -} - - -/************************************************************************* - * Broadcast file truncation - *************************************************************************/ - -/* Forward the truncate broadcast to all children and wait for responses */ -static -int truncate_bcast_forward(const unifyfs_tree_t* broadcast_tree, - truncate_bcast_in_t* in) -{ - int i, rc, ret; - int gfid = (int) in->gfid; - size_t fsize = (size_t) in->filesize; - LOGDBG("MARGOTREE: truncate bcast forward - gfid=%d size=%zu", - gfid, fsize); - - /* apply truncation to local file state */ - ret = unifyfs_inode_truncate(gfid, (unsigned long)fsize); - if (ret != UNIFYFS_SUCCESS) { - /* owner is root of broadcast tree */ - int is_owner = ((int)(in->root) == glb_pmi_rank); - if ((ret == ENOENT) && !is_owner) { - /* it's ok if inode doesn't exist at non-owners */ - ret = UNIFYFS_SUCCESS; - } else { - LOGERR("unifyfs_inode_truncate(gfid=%d, size=%zu) failed - ret=%d", - gfid, fsize, ret); - goto out; + hg_bulk_t extents_bulk = HG_BULK_NULL; + if (n_extents) { + void* buf = (void*) extents; + hg_size_t buf_size = n_extents * sizeof(*extents); + hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, + &buf, &buf_size, + HG_BULK_READ_ONLY, &extents_bulk); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + free(buf); + return UNIFYFS_ERROR_MARGO; } } - /* get info for tree */ - int child_count = broadcast_tree->child_count; - int* child_ranks = broadcast_tree->child_ranks; - if (child_count > 0) { - LOGDBG("MARGOTREE: sending truncate to %d children", - child_count); - - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = calloc(child_count, - sizeof(coll_request)); - if (!requests) { + coll_request* coll = NULL; + laminate_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* set input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = (int32_t) gfid; + in->attr = attrs; + in->extents = extents_bulk; + in->num_extents = 
(int32_t) n_extents; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.laminate_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_LAMINATE; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(laminate_bcast_out_t), + HG_BULK_NULL, extents_bulk, extents); + if (NULL == coll) { ret = ENOMEM; - goto out; - } - - /* forward request down the tree */ - coll_request* req; - hg_id_t hgid = unifyfsd_rpc_context->rpcs.truncate_bcast_id; - for (i = 0; i < child_count; i++) { - req = requests + i; - - /* get rank of this child */ - int child = child_ranks[i]; - LOGDBG("MARGOTREE: truncate child[%d] is rank %d - %s", - i, child, glb_servers[child].margo_svr_addr_str); - - /* allocate handle */ - rc = get_request_handle(hgid, child, req); - if (rc == UNIFYFS_SUCCESS) { - /* invoke truncate request rpc on child */ - rc = forward_request((void*)in, req); - } else { - ret = rc; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); } } + } - /* wait for the requests to finish */ - for (i = 0; i < child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - truncate_bcast_out_t out; - hg_return_t hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = out.ret; - LOGDBG("MARGOTREE: truncate child[%d] response: ret=%d", - i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; - } - margo_free_output(req->handle, &out); - } - margo_destroy(req->handle); - } else { - ret = rc; - } + if (ret != UNIFYFS_SUCCESS) { + if (NULL != extents) { + free(extents); } - - free(requests); } -out: return ret; } + +/************************************************************************* + * Broadcast file truncation + *************************************************************************/ + /* truncate broadcast rpc handler */ static void truncate_bcast_rpc(hg_handle_t handle) { - LOGDBG("MARGOTREE: truncate bcast handler"); + LOGDBG("BCAST_RPC: truncate handler"); /* assume we'll succeed */ - int32_t ret = UNIFYFS_SUCCESS; + int ret = UNIFYFS_SUCCESS; - /* get input params */ - truncate_bcast_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + truncate_bcast_in_t* in = calloc(1, sizeof(*in)); + truncate_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; } else { - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - ret = truncate_bcast_forward(&bcast_tree, &in); - - unifyfs_tree_free(&bcast_tree); - margo_free_input(handle, &in); + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.truncate_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_TRUNCATE; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = 
collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); + } + } + } + } } - /* build our output values */ - truncate_bcast_out_t out; - out.ret = ret; + if (ret != UNIFYFS_SUCCESS) { + /* report failure back to caller */ + truncate_bcast_out_t tbo; + tbo.ret = (int32_t)ret; + hg_return_t hret = margo_respond(handle, &tbo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + if (NULL != coll) { + collective_cleanup(coll); + } else { + margo_destroy(handle); + } } - - /* free margo resources */ - margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(truncate_bcast_rpc) /* Execute broadcast tree for file truncate */ -int unifyfs_invoke_broadcast_truncate(int gfid, size_t filesize) +int unifyfs_invoke_broadcast_truncate(int gfid, + size_t filesize) { - LOGDBG("broadcasting truncate for gfid=%d filesize=%zu", - gfid, filesize); + LOGDBG("BCAST_RPC: starting truncate(filesize=%zu) for gfid=%d", + filesize, gfid); /* assuming success */ int ret = UNIFYFS_SUCCESS; - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - /* fill in input struct */ - truncate_bcast_in_t in; - in.root = (int32_t) glb_pmi_rank; - in.gfid = gfid; - in.filesize = filesize; - - ret = truncate_bcast_forward(&bcast_tree, &in); - if (ret) { - LOGERR("truncate_bcast_forward failed: (ret=%d)", ret); + coll_request* coll = NULL; + truncate_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* get input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = gfid; + in->filesize = filesize; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.truncate_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_TRUNCATE; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(truncate_bcast_out_t), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); + } + } } - - unifyfs_tree_free(&bcast_tree); - return ret; } @@ -855,130 +907,67 @@ int unifyfs_invoke_broadcast_truncate(int gfid, size_t filesize) * Broadcast updates to file attributes *************************************************************************/ -/* Forward the fileattr broadcast to all children and wait for responses */ -static -int fileattr_bcast_forward(const unifyfs_tree_t* broadcast_tree, - fileattr_bcast_in_t* in) +/* file attributes broadcast rpc handler */ +static void fileattr_bcast_rpc(hg_handle_t handle) { - int i, rc, ret; - int gfid = (int) in->gfid; - - LOGDBG("MARGOTREE: fileattr bcast forward (gfid=%d)", gfid); + LOGDBG("BCAST_RPC: fileattr handler"); - /* set local metadata for target file */ - ret = unifyfs_inode_metaset(gfid, in->attrop, &in->attr); - if (ret) { - goto out; - } - - /* get info for tree */ - int child_count = broadcast_tree->child_count; - int* child_ranks = broadcast_tree->child_ranks; - if (child_count > 0) { - LOGDBG("MARGOTREE: %d: sending metaset to %d children", - glb_pmi_rank, child_count); - - 
/* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = calloc(child_count, - sizeof(coll_request)); - if (!requests) { - ret = ENOMEM; - goto out; - } + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - /* forward request down the tree */ - coll_request* req; - hg_id_t hgid = unifyfsd_rpc_context->rpcs.fileattr_bcast_id; - for (i = 0; i < child_count; i++) { - req = requests + i; - - /* get rank of this child */ - int child = child_ranks[i]; - LOGDBG("MARGOTREE: metaset child[%d] is rank %d - %s", - i, child, glb_servers[child].margo_svr_addr_str); - - /* allocate handle */ - rc = get_request_handle(hgid, child, req); - if (rc == UNIFYFS_SUCCESS) { - /* invoke metaset request rpc on child */ - rc = forward_request((void*)in, req); + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + fileattr_bcast_in_t* in = calloc(1, sizeof(*in)); + fileattr_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; + } else { + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.fileattr_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_FILEATTR; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; } else { - ret = rc; - } - } - - /* wait for the requests to finish */ - for (i = 0; i < child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - fileattr_bcast_out_t out; - hg_return_t hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = out.ret; - LOGDBG("MARGOTREE: metaset child[%d] response: ret=%d", - i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); } - margo_free_output(req->handle, &out); } - margo_destroy(req->handle); - } else { - ret = rc; } } - - free(requests); - } -out: - return ret; -} - -/* file attributes broadcast rpc handler */ -static void fileattr_bcast_rpc(hg_handle_t handle) -{ - LOGDBG("MARGOTREE: fileattr bcast handler"); - - /* assume we'll succeed */ - int32_t ret = UNIFYFS_SUCCESS; - - /* get input params */ - fileattr_bcast_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - ret = fileattr_bcast_forward(&bcast_tree, &in); - - unifyfs_tree_free(&bcast_tree); - margo_free_input(handle, &in); } - /* build our output values */ - fileattr_bcast_out_t out; - out.ret = ret; + if (ret != UNIFYFS_SUCCESS) { + /* report failure back to caller */ + fileattr_bcast_out_t fbo; + fbo.ret = (int32_t)ret; + 
hg_return_t hret = margo_respond(handle, &fbo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + if (NULL != coll) { + collective_cleanup(coll); + } else { + margo_destroy(handle); + } } - - /* free margo resources */ - margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(fileattr_bcast_rpc) @@ -987,27 +976,37 @@ int unifyfs_invoke_broadcast_fileattr(int gfid, int attr_op, unifyfs_file_attr_t* fattr) { - LOGDBG("broadcasting file attributes for gfid=%d", gfid); - - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - /* fill in input struct */ - fileattr_bcast_in_t in; - in.root = (int32_t) glb_pmi_rank; - in.gfid = gfid; - in.attrop = attr_op; - in.attr = *fattr; - - int ret = fileattr_bcast_forward(&bcast_tree, &in); - if (ret) { - LOGERR("fileattr_bcast_forward failed: (ret=%d)", ret); - } + LOGDBG("BCAST_RPC: starting metaset(op=%d) for gfid=%d", attr_op, gfid); - unifyfs_tree_free(&bcast_tree); + /* assuming success */ + int ret = UNIFYFS_SUCCESS; + coll_request* coll = NULL; + fileattr_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* get input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = (int32_t) gfid; + in->attrop = (int32_t) attr_op; + in->attr = *fattr; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.fileattr_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_FILEATTR; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(fileattr_bcast_out_t), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); + } + } + } return ret; } @@ -1015,154 +1014,101 @@ int unifyfs_invoke_broadcast_fileattr(int gfid, * Broadcast file unlink *************************************************************************/ -/* Forward the unlink broadcast to all children and wait for responses */ -static -int unlink_bcast_forward(const unifyfs_tree_t* broadcast_tree, - unlink_bcast_in_t* in) +/* unlink broacast rpc handler */ +static void unlink_bcast_rpc(hg_handle_t handle) { - int i, rc, ret; - int gfid = (int) in->gfid; - - LOGDBG("MARGOTREE: unlink bcast forward (gfid=%d)", gfid); - - /* remove local file metadata */ - ret = unifyfs_inode_unlink(in->gfid); - if (ret) { - goto out; - } + LOGDBG("BCAST_RPC: unlink handler"); - /* get info for tree */ - int child_count = broadcast_tree->child_count; - int* child_ranks = broadcast_tree->child_ranks; - if (child_count > 0) { - LOGDBG("MARGOTREE: %d: sending unlink to %d children", - glb_pmi_rank, child_count); - - /* allocate memory for request objects - * TODO: possibly get this from memory pool */ - coll_request* requests = calloc(child_count, - sizeof(coll_request)); - if (!requests) { - ret = ENOMEM; - goto out; - } + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - /* forward request down the tree */ - coll_request* req; - hg_id_t hgid = unifyfsd_rpc_context->rpcs.unlink_bcast_id; - for (i = 0; i < child_count; i++) { - req = requests + i; - - /* get rank of this child */ - int child = child_ranks[i]; - LOGDBG("MARGOTREE: unlink child[%d] is rank %d - %s", - i, child, glb_servers[child].margo_svr_addr_str); - - /* allocate handle */ - rc = 
get_request_handle(hgid, child, req); - if (rc == UNIFYFS_SUCCESS) { - /* invoke unlink request rpc on child */ - rc = forward_request((void*)in, req); + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + unlink_bcast_in_t* in = calloc(1, sizeof(*in)); + unlink_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; + } else { + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.unlink_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_UNLINK; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; } else { - ret = rc; - } - } - - /* wait for the requests to finish */ - for (i = 0; i < child_count; i++) { - req = requests + i; - rc = wait_for_request(req); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - unlink_bcast_out_t out; - hg_return_t hret = margo_get_output(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* set return value */ - int child_ret = out.ret; - LOGDBG("MARGOTREE: unlink child[%d] response: ret=%d", - i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); } - margo_free_output(req->handle, &out); } - margo_destroy(req->handle); - } else { - ret = rc; } } - - free(requests); } -out: - return ret; -} - -/* unlink broacast rpc handler */ -static void unlink_bcast_rpc(hg_handle_t handle) -{ - LOGDBG("MARGOTREE: unlink bcast handler"); - - int32_t ret; - - /* get input params */ - unlink_bcast_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, in.root, - UNIFYFS_BCAST_K_ARY, &bcast_tree); - - ret = unlink_bcast_forward(&bcast_tree, &in); - - unifyfs_tree_free(&bcast_tree); - margo_free_input(handle, &in); - } - - /* build our output values */ - unlink_bcast_out_t out; - out.ret = ret; + if (ret != UNIFYFS_SUCCESS) { + /* report failure back to caller */ + unlink_bcast_out_t ubo; + ubo.ret = (int32_t)ret; + hg_return_t hret = margo_respond(handle, &ubo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + if (NULL != coll) { + collective_cleanup(coll); + } else { + margo_destroy(handle); + } } - - /* free margo resources */ - margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unlink_bcast_rpc) /* Execute broadcast tree for file unlink */ int unifyfs_invoke_broadcast_unlink(int gfid) { - LOGDBG("broadcasting unlink for gfid=%d", gfid); - - /* create communication tree */ - unifyfs_tree_t bcast_tree; - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, glb_pmi_rank, - UNIFYFS_BCAST_K_ARY, 
&bcast_tree); + LOGDBG("BCAST_RPC: starting unlink for gfid=%d", gfid); - /* fill in input struct */ - unlink_bcast_in_t in; - in.root = (int32_t) glb_pmi_rank; - in.gfid = (int32_t) gfid; + /* assuming success */ + int ret = UNIFYFS_SUCCESS; - int ret = unlink_bcast_forward(&bcast_tree, &in); - if (ret) { - LOGERR("unlink_bcast_forward failed: (ret=%d)", ret); + coll_request* coll = NULL; + unlink_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* get input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = gfid; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.unlink_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_UNLINK; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(unlink_bcast_out_t), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); + } + } } - - unifyfs_tree_free(&bcast_tree); - return ret; } diff --git a/server/src/unifyfs_group_rpc.h b/server/src/unifyfs_group_rpc.h index 80db9f0d9..fac96912b 100644 --- a/server/src/unifyfs_group_rpc.h +++ b/server/src/unifyfs_group_rpc.h @@ -15,11 +15,41 @@ #ifndef _UNIFYFS_GROUP_RPC_H #define _UNIFYFS_GROUP_RPC_H -#include "unifyfs_tree.h" +#include "unifyfs_global.h" #include "unifyfs_inode.h" +#include "unifyfs_service_manager.h" +#include "unifyfs_tree.h" +#include "margo_server.h" /* Collective Server RPCs */ +/* server collective (coll) request state structure */ +typedef struct { + server_rpc_e req_type; + unifyfs_tree_t tree; + hg_handle_t resp_hdl; + size_t output_sz; /* size of output struct */ + void* output; /* output struct (type is dependent on rpc) */ + void* input; + void* bulk_buf; /* allocated buffer for bulk data */ + hg_bulk_t bulk_in; + hg_bulk_t bulk_forward; + margo_request* child_reqs; + hg_handle_t* child_hdls; +} coll_request; + +/* set collective output return value to local result value */ +void collective_set_local_retval(coll_request* coll_req, int val); + +/** + * @brief Progress an ongoing broadcast tree operation + * + * @param coll_req the broadcast collective + * + * @return success|failure + */ +int invoke_bcast_progress_rpc(coll_request* coll_req); + /** * @brief Broadcast file extents metadata to all servers * @@ -63,7 +93,8 @@ int unifyfs_invoke_broadcast_laminate(int gfid); * * @return success|failure */ -int unifyfs_invoke_broadcast_truncate(int gfid, size_t filesize); +int unifyfs_invoke_broadcast_truncate(int gfid, + size_t filesize); /** * @brief Unlink file at all servers diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index e22e7db2b..a49aae931 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -28,13 +28,23 @@ static inline struct unifyfs_inode* unifyfs_inode_alloc(int gfid, unifyfs_file_attr_t* attr) { struct unifyfs_inode* ino = calloc(1, sizeof(*ino)); - - if (ino) { + if (NULL != ino) { + struct extent_tree* tree = calloc(1, sizeof(*tree)); + if (NULL == tree) { + LOGERR("failed to allocate memory for inode extent tree"); + free(ino); + return NULL; + } + extent_tree_init(tree); + ino->extents = tree; ino->gfid = gfid; ino->attr = *attr; ino->attr.filename = strdup(attr->filename); - pthread_rwlock_init(&ino->rwlock, NULL); + + pthread_rwlock_init(&(ino->rwlock), NULL); ABT_mutex_create(&(ino->abt_sync)); + } else { + LOGERR("failed to allocate memory for inode"); } return ino; @@ -55,7 +65,9 @@ int 
unifyfs_inode_destroy(struct unifyfs_inode* ino) free(ino->extents); } - pthread_rwlock_destroy(&ino->rwlock); + pthread_rwlock_destroy(&(ino->rwlock)); + ABT_mutex_free(&(ino->abt_sync)); + free(ino); } else { ret = EINVAL; @@ -103,23 +115,24 @@ void unifyfs_inode_unlock(struct unifyfs_inode* ino) int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr) { - int ret = UNIFYFS_SUCCESS; - struct unifyfs_inode* ino = NULL; - - if (!attr) { + if (NULL == attr) { return EINVAL; } - ino = unifyfs_inode_alloc(gfid, attr); + struct unifyfs_inode* ino = unifyfs_inode_alloc(gfid, attr); + if (NULL == ino) { + return ENOMEM; + } + int ret = UNIFYFS_SUCCESS; unifyfs_inode_tree_wrlock(global_inode_tree); { ret = unifyfs_inode_tree_insert(global_inode_tree, ino); } unifyfs_inode_tree_unlock(global_inode_tree); - if (ret) { - free(ino); + if (ret != UNIFYFS_SUCCESS) { + unifyfs_inode_destroy(ino); } return ret; @@ -128,17 +141,17 @@ int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr) int unifyfs_inode_update_attr(int gfid, int attr_op, unifyfs_file_attr_t* attr) { - int ret = UNIFYFS_SUCCESS; - struct unifyfs_inode* ino = NULL; - - if (!attr) { + if (NULL == attr) { return EINVAL; } + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + unifyfs_inode_tree_rdlock(global_inode_tree); { ino = unifyfs_inode_tree_search(global_inode_tree, gfid); - if (!ino) { + if (NULL == ino) { ret = ENOENT; } else { unifyfs_inode_wrlock(ino); @@ -170,14 +183,14 @@ int unifyfs_inode_metaget(int gfid, unifyfs_file_attr_t* attr) int ret = UNIFYFS_SUCCESS; struct unifyfs_inode* ino = NULL; - if (!global_inode_tree || !attr) { + if ((NULL == global_inode_tree) || (NULL == attr)) { return EINVAL; } unifyfs_inode_tree_rdlock(global_inode_tree); { ino = unifyfs_inode_tree_search(global_inode_tree, gfid); - if (ino) { + if (NULL != ino) { *attr = ino->attr; } else { ret = ENOENT; @@ -265,27 +278,19 @@ int unifyfs_inode_add_extents(int gfid, int num_extents, unifyfs_inode_wrlock(ino); { tree = ino->extents; - - /* create extent_tree if it doesn't exist yet */ if (NULL == tree) { - tree = (struct extent_tree*) calloc(1, sizeof(*tree)); - if (NULL == tree) { - LOGERR("failed to allocate memory for extent tree"); - goto out_unlock_inode; - } else { - extent_tree_init(tree); - ino->extents = tree; - } + LOGERR("inode extent tree is missing"); + goto out_unlock_inode; } for (i = 0; i < num_extents; i++) { struct extent_tree_node* current = &nodes[i]; - /* the output becomes too noisy with this: - * LOGDBG("new extent[%4d]: (%lu, %lu)", - * i, current->start, current->end); + /* debug output becomes too noisy with this: + * LOGDBG("extent[%4d]: [%lu, %lu] @ server[%d] log(%d:%d:%lu)", + * i, current->start, current->end, current->svr_rank, + * current->app_id, current->cli_id, current->pos); */ - ret = extent_tree_add(tree, current->start, current->end, current->svr_rank, current->app_id, current->cli_id, current->pos); @@ -318,7 +323,7 @@ int unifyfs_inode_add_extents(int gfid, int num_extents, return ret; } -int unifyfs_inode_get_filesize(int gfid, size_t* offset) +int unifyfs_inode_get_filesize(int gfid, size_t* outsize) { int ret = UNIFYFS_SUCCESS; size_t filesize = 0; @@ -338,7 +343,7 @@ int unifyfs_inode_get_filesize(int gfid, size_t* offset) } unifyfs_inode_unlock(ino); - *offset = filesize; + *outsize = filesize; LOGDBG("local file size (gfid=%d): %lu", gfid, filesize); } } @@ -471,6 +476,11 @@ int compare_chunk_read_reqs(const void* _c1, const void* _c2) } else if (c1->rank < c2->rank) { return -1; } 
else { + if (c1->offset > c2->offset) { + return 1; + } else if (c1->offset < c2->offset) { + return -1; + } return 0; } } @@ -503,14 +513,14 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, for (i = 0; i < n_extents; i++) { unifyfs_inode_extent_t* current = &extents[i]; - LOGDBG("resolving chunk request (gfid=%d, offset=%lu, length=%lu)", + LOGDBG("resolving extent request [gfid=%d, offset=%lu, length=%lu]", current->gfid, current->offset, current->length); ret = unifyfs_inode_get_extent_chunks(current, &n_resolved[i], &resolved[i]); if (ret) { - LOGERR("failed to resolve the chunk request for chunk " - "[gfid=%d, offset=%lu, length=%zu] (ret=%d)", + LOGERR("failed to resolve extent request " + "[gfid=%d, offset=%lu, length=%lu] (ret=%d)", current->gfid, current->offset, current->length, ret); goto out_fail; } @@ -530,8 +540,10 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, chunk_read_req_t* pos = chunks; for (i = 0; i < n_extents; i++) { + chunk_read_req_t* ext_chunks = resolved[i]; for (j = 0; j < n_resolved[i]; j++) { - *pos = resolved[i][j]; + /* debug_print_chunk_read_req(ext_chunks + j); */ + *pos = ext_chunks[j]; pos++; } if (resolved[i]) { @@ -539,14 +551,13 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, } } - /* sort the requests based on server rank */ - qsort(chunks, n_chunks, sizeof(*chunks), compare_chunk_read_reqs); - + if (n_chunks > 1) { + /* sort the requests based on server rank */ + qsort(chunks, n_chunks, sizeof(*chunks), compare_chunk_read_reqs); + } chunk_read_req_t* chk = chunks; for (i = 0; i < n_chunks; i++, chk++) { - LOGDBG(" [%d] (offset=%lu, nbytes=%lu) @ (%d log(%d:%d:%lu))", - i, chk->offset, chk->nbytes, chk->rank, - chk->log_client_id, chk->log_app_id, chk->log_offset); + debug_print_chunk_read_req(chk); } } diff --git a/server/src/unifyfs_inode.h b/server/src/unifyfs_inode.h index 714ab3732..e2412e4ac 100644 --- a/server/src/unifyfs_inode.h +++ b/server/src/unifyfs_inode.h @@ -83,8 +83,9 @@ int unifyfs_inode_metaset(int gfid, int attr_op, /** * @brief read attributes for file with @gfid. * - * @param gfid global file identifier - * @param attr [out] file attributes to be filled + * @param gfid global file identifier + * + * @param[out] attr output file attributes * * @return 0 on success, errno otherwise */ @@ -94,7 +95,7 @@ int unifyfs_inode_metaget(int gfid, unifyfs_file_attr_t* attr); * @brief unlink file with @gfid. this will remove the target file inode from * the global inode tree. * - * @param gfid global file identifier + * @param gfid global file identifier * * @return 0 on success, errno otherwise */ @@ -103,8 +104,8 @@ int unifyfs_inode_unlink(int gfid); /** * @brief truncate size of file with @gfid to @size. 
* - * @param gfid global file identifier - * @param size new file size + * @param gfid global file identifier + * @param size new file size * * @return 0 on success, errno otherwise */ @@ -113,9 +114,9 @@ int unifyfs_inode_truncate(int gfid, unsigned long size); /** * @brief get the local extent array from the target inode * - * @param gfid the global file identifier - * @param n the number of extents, set by this function - * @param nodes the pointer to the array of extents, caller should free this + * @param gfid the global file identifier + * @param n the number of extents, set by this function + * @param nodes the pointer to the array of extents, caller should free this * * @return 0 on success, errno otherwise */ @@ -125,9 +126,9 @@ int unifyfs_inode_get_extents(int gfid, size_t* n, /** * @brief add new extents to the inode * - * @param gfid the global file identifier - * @param n the number of new extents in @nodes - * @param nodes an array of extents to be added + * @param gfid the global file identifier + * @param n the number of new extents in @nodes + * @param nodes an array of extents to be added * * @return */ @@ -136,12 +137,13 @@ int unifyfs_inode_add_extents(int gfid, int n, struct extent_tree_node* nodes); /** * @brief get the maximum file size from the local extent tree of given file * - * @param gfid global file identifier - * @param offset [out] file offset to be filled by this function + * @param gfid global file identifier + * + * @param[out] outsize output file size * * @return 0 on success, errno otherwise */ -int unifyfs_inode_get_filesize(int gfid, size_t* offset); +int unifyfs_inode_get_filesize(int gfid, size_t* outsize); /** * @brief set the given file as laminated @@ -190,24 +192,24 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, * starting from lowest starting offset, sets outnum with actual number of * entries returned * - * @param gfid global file id - * @param start starting logical offset - * @param end ending logical offset - * @param max maximum number of key/vals to return - * @param keys array of length max for output keys - * @param vals array of length max for output values - * @param outnum number of entries returned + * @param gfid global file id + * @param start starting logical offset + * @param end ending logical offset + * @param max maximum number of key/vals to return + * + * @param[out] keys array of length max for output keys + * @param[out] vals array of length max for output values + * @param[out] outnum output number of entries returned * * @return */ -int unifyfs_inode_span_extents( - int gfid, /* global file id we're looking in */ - unsigned long start, /* starting logical offset */ - unsigned long end, /* ending logical offset */ - int max, /* maximum number of key/vals to return */ - void* keys, /* array of length max for output keys */ - void* vals, /* array of length max for output values */ - int* outnum); /* number of entries returned */ +int unifyfs_inode_span_extents(int gfid, + unsigned long start, + unsigned long end, + int max, + void* keys, + void* vals, + int* outnum); /** * @brief prints the inode information to the log stream diff --git a/server/src/unifyfs_inode_tree.c b/server/src/unifyfs_inode_tree.c index 311a6a994..ba38391d7 100644 --- a/server/src/unifyfs_inode_tree.c +++ b/server/src/unifyfs_inode_tree.c @@ -50,24 +50,22 @@ RB_GENERATE( int unifyfs_inode_tree_init( struct unifyfs_inode_tree* tree) { - int ret = 0; - - if (!tree) { + if (NULL == tree) { return EINVAL; } memset(tree, 0, 
sizeof(*tree)); - ret = pthread_rwlock_init(&tree->rwlock, NULL); + pthread_rwlock_init(&tree->rwlock, NULL); RB_INIT(&tree->head); - return ret; + return UNIFYFS_SUCCESS; } /* Remove and free all nodes in the unifyfs_inode_tree. */ void unifyfs_inode_tree_destroy( struct unifyfs_inode_tree* tree) { - if (tree) { + if (NULL != tree) { unifyfs_inode_tree_clear(tree); pthread_rwlock_destroy(&tree->rwlock); } @@ -77,10 +75,9 @@ int unifyfs_inode_tree_insert( struct unifyfs_inode_tree* tree, /* tree on which to add new entry */ struct unifyfs_inode* ino) /* initial file attribute */ { - int ret = 0; struct unifyfs_inode* existing = NULL; - if (!ino || (ino->gfid != ino->attr.gfid)) { + if ((NULL == ino) || (ino->gfid != ino->attr.gfid)) { return EINVAL; } @@ -92,7 +89,7 @@ int unifyfs_inode_tree_insert( RB_INSERT(rb_inode_tree, &tree->head, ino); - return ret; + return UNIFYFS_SUCCESS; } /* Search for and return entry for given gfid on specified tree. @@ -111,11 +108,8 @@ int unifyfs_inode_tree_remove( int gfid, struct unifyfs_inode** removed) { - int ret = 0; - struct unifyfs_inode* ino = NULL; - - ino = unifyfs_inode_tree_search(tree, gfid); - if (!ino) { + struct unifyfs_inode* ino = unifyfs_inode_tree_search(tree, gfid); + if (NULL == ino) { return ENOENT; } @@ -123,7 +117,7 @@ int unifyfs_inode_tree_remove( *removed = ino; - return ret; + return UNIFYFS_SUCCESS; } /* diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index 0efdd0e9f..ffe9c6a71 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -13,8 +13,6 @@ */ #include "unifyfs_global.h" -#include "margo_server.h" -#include "unifyfs_server_rpcs.h" #include "unifyfs_p2p_rpc.h" #include "unifyfs_group_rpc.h" @@ -28,23 +26,19 @@ int hash_gfid_to_server(int gfid) return gfid % glb_pmi_size; } -/* server peer-to-peer (p2p) margo request structure */ -typedef struct { - margo_request request; - hg_addr_t peer; - hg_handle_t handle; -} p2p_request; - /* helper method to initialize peer request rpc handle */ -static -int get_request_handle(hg_id_t request_hgid, - int peer_rank, - p2p_request* req) +int get_p2p_request_handle(hg_id_t request_hgid, + int peer_rank, + p2p_request* req) { int rc = UNIFYFS_SUCCESS; /* get address for specified server rank */ - req->peer = glb_servers[peer_rank].margo_svr_addr; + req->peer = get_margo_server_address(peer_rank); + if (HG_ADDR_NULL == req->peer) { + LOGERR("missing margo address for rank=%d", peer_rank); + return UNIFYFS_ERROR_MARGO; + } /* get handle to rpc function */ hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, req->peer, @@ -59,9 +53,8 @@ int get_request_handle(hg_id_t request_hgid, } /* helper method to forward peer rpc request */ -static -int forward_request(void* input_ptr, - p2p_request* req) +int forward_p2p_request(void* input_ptr, + p2p_request* req) { int rc = UNIFYFS_SUCCESS; @@ -77,8 +70,7 @@ int forward_request(void* input_ptr, } /* helper method to wait for peer rpc request completion */ -static -int wait_for_request(p2p_request* req) +int wait_for_p2p_request(p2p_request* req) { int rc = UNIFYFS_SUCCESS; @@ -92,80 +84,317 @@ int wait_for_request(p2p_request* req) return rc; } + /************************************************************************* - * File extents metadata update request + * File chunk reads request/response *************************************************************************/ -/* Add extents rpc handler */ -static void add_extents_rpc(hg_handle_t handle) +/* invokes the server-server chunk 
read request rpc */ +int invoke_chunk_read_request_rpc(int dst_srvr_rank, + server_read_req_t* rdreq, + server_chunk_reads_t* remote_reads) { - LOGDBG("add_extents rpc handler"); + int num_chunks = remote_reads->num_chunks; + if (dst_srvr_rank == glb_pmi_rank) { + // short-circuit for local requests + return sm_issue_chunk_reads(glb_pmi_rank, + rdreq->app_id, + rdreq->client_id, + rdreq->req_ndx, + num_chunks, + remote_reads->total_sz, + (char*)(remote_reads->reqs)); + } - /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + hg_handle_t handle; + chunk_read_request_in_t in; + chunk_read_request_out_t out; + hg_return_t hret; + hg_addr_t dst_srvr_addr; + hg_size_t bulk_sz = (hg_size_t)num_chunks * sizeof(chunk_read_req_t); + + assert(dst_srvr_rank < (int)glb_num_servers); + dst_srvr_addr = get_margo_server_address(dst_srvr_rank); + if (HG_ADDR_NULL == dst_srvr_addr) { + LOGERR("missing margo address for rank=%d", dst_srvr_rank); + return UNIFYFS_ERROR_MARGO; + } + + hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, + unifyfsd_rpc_context->rpcs.chunk_read_request_id, + &handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_create() failed"); + return UNIFYFS_ERROR_MARGO; + } + + /* fill in input struct */ + in.src_rank = (int32_t)glb_pmi_rank; + in.app_id = (int32_t)rdreq->app_id; + in.client_id = (int32_t)rdreq->client_id; + in.req_id = (int32_t)rdreq->req_ndx; + in.num_chks = (int32_t)num_chunks; + in.total_data_size = (hg_size_t)remote_reads->total_sz; + in.bulk_size = bulk_sz; + + /* register request buffer for bulk remote access */ + void* data_buf = remote_reads->reqs; + hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, + &data_buf, &bulk_sz, + HG_BULK_READ_ONLY, &in.bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + LOGDBG("invoking the chunk-read-request rpc function"); + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* decode response */ + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + ret = (int)out.ret; + LOGDBG("Got request rpc response from %d - ret=%d", + dst_srvr_rank, ret); + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + } + + margo_bulk_free(in.bulk_handle); + } + margo_destroy(handle); + + return ret; +} + +/* handler for server-server chunk read request */ +static void chunk_read_request_rpc(hg_handle_t handle) +{ int32_t ret = UNIFYFS_SUCCESS; + hg_return_t hret; + + /* get input params */ + chunk_read_request_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* extract params from input struct */ + size_t bulk_sz = (size_t)in->bulk_size; + if (bulk_sz) { + /* allocate and register local target buffer for bulk access */ + void* reqbuf = pull_margo_bulk_buffer(handle, in->bulk_handle, + in->bulk_size, NULL); + if (NULL == reqbuf) { + LOGERR("failed to get bulk chunk reads"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_CHUNK_READ; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = reqbuf; + req->bulk_sz = bulk_sz; + ret = sm_submit_service_request(req); + } + if (ret != UNIFYFS_SUCCESS) { + 
margo_free_input(handle, in); + } + } else { + LOGWARN("empty chunk read request"); + ret = EINVAL; + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + if (NULL != req->bulk_buf) { + free(req->bulk_buf); + } + free(req); + } + + /* return to caller */ + chunk_read_request_out_t out; + out.ret = ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(chunk_read_request_rpc) + +/* Respond to chunk read request. Sends a set of read reply + * headers and corresponding data back to the requesting server. + * The headers and data are posted as a bulk transfer buffer */ +int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr) +{ + /* assume we'll succeed */ + int rc = UNIFYFS_SUCCESS; + + /* rank of destination server */ + int dst_rank = scr->rank; + assert(dst_rank < (int)glb_num_servers); + + /* get address of destinaton server */ + hg_addr_t dst_addr = get_margo_server_address(dst_rank); + if (HG_ADDR_NULL == dst_addr) { + LOGERR("missing margo address for rank=%d", dst_rank); + return UNIFYFS_ERROR_MARGO; + } + + /* pointer to struct containing rpc context info, + * shorter name for convience */ + ServerRpcContext_t* ctx = unifyfsd_rpc_context; + + /* get handle to read response rpc on destination server */ + hg_handle_t handle; + hg_id_t resp_id = ctx->rpcs.chunk_read_response_id; + hg_return_t hret = margo_create(ctx->svr_mid, dst_addr, + resp_id, &handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_create() failed"); + return UNIFYFS_ERROR_MARGO; + } + + /* get address and size of our response buffer */ + void* data_buf = (void*)scr->resp; + hg_size_t bulk_sz = scr->total_sz; + + /* register our response buffer for bulk remote read access */ + chunk_read_response_in_t in; + hret = margo_bulk_create(ctx->svr_mid, 1, &data_buf, &bulk_sz, + HG_BULK_READ_ONLY, &in.bulk_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + return UNIFYFS_ERROR_MARGO; + } + + /* fill in input struct */ + in.src_rank = (int32_t)glb_pmi_rank; + in.app_id = (int32_t)scr->app_id; + in.client_id = (int32_t)scr->client_id; + in.req_id = (int32_t)scr->rdreq_id; + in.num_chks = (int32_t)scr->num_chunks; + in.bulk_size = bulk_sz; + + /* call the read response rpc */ + LOGDBG("invoking the chunk-read-response rpc function"); + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + rc = UNIFYFS_ERROR_MARGO; + } else { + /* rpc executed, now decode response */ + chunk_read_response_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + rc = (int)out.ret; + LOGDBG("chunk-read-response rpc to server[%d] - ret=%d", + dst_rank, rc); + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + rc = UNIFYFS_ERROR_MARGO; + } + } + + /* free resources allocated for executing margo rpc */ + margo_bulk_free(in.bulk_handle); + margo_destroy(handle); + + /* free response data buffer */ + free(data_buf); + scr->resp = NULL; - const struct hg_info* hgi = margo_get_info(handle); - assert(hgi); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); + return rc; +} + +/* handler for server-server chunk read response */ +static void chunk_read_response_rpc(hg_handle_t handle) +{ + int32_t ret = 
UNIFYFS_SUCCESS; + chunk_read_response_out_t out; /* get input params */ - add_extents_in_t in; + chunk_read_response_in_t in; hg_return_t hret = margo_get_input(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + ret = (int32_t) UNIFYFS_ERROR_MARGO; } else { - int sender = in.src_rank; - int gfid = in.gfid; - size_t num_extents = (size_t) in.num_extents; - size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); - - /* allocate memory for extents */ - void* extents_buf = malloc(bulk_sz); - if (NULL == extents_buf) { - LOGERR("allocation for bulk extents failed"); - ret = ENOMEM; + /* extract params from input struct */ + int src_rank = (int)in.src_rank; + int app_id = (int)in.app_id; + int client_id = (int)in.client_id; + int req_id = (int)in.req_id; + int num_chks = (int)in.num_chks; + size_t bulk_sz = (size_t)in.bulk_size; + + LOGDBG("received read response from server[%d] (%d chunks)", + src_rank, num_chks); + + /* The input parameters specify the info for a bulk transfer + * buffer on the sending process. We use that info to pull data + * from the sender into a local buffer. This buffer contains + * the read reply headers and associated read data for requests + * we had sent earlier. */ + + /* pull the remote data via bulk transfer */ + if (0 == bulk_sz) { + /* sender is trying to send an empty buffer, + * don't think that should happen unless maybe + * we had sent a read request list that was empty? */ + LOGERR("empty response buffer"); + ret = (int32_t)EINVAL; } else { - /* register local target buffer for bulk access */ - hg_bulk_t bulk_handle; - hret = margo_bulk_create(mid, 1, &extents_buf, &bulk_sz, - HG_BULK_WRITE_ONLY, &bulk_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; + /* allocate a buffer to hold the incoming data */ + char* resp_buf = (char*) pull_margo_bulk_buffer(handle, + in.bulk_handle, + in.bulk_size, + NULL); + if (NULL == resp_buf) { + /* allocation failed, that's bad */ + LOGERR("failed to get chunk read responses buffer"); + ret = (int32_t)UNIFYFS_ERROR_MARGO; } else { - /* get list of read requests */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, - hgi->addr, in.extents, 0, - bulk_handle, 0, - bulk_sz); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* store new extents */ - LOGINFO("received %zu extents for gfid=%d from %d", - num_extents, gfid, sender); - struct extent_tree_node* extents = extents_buf; - ret = unifyfs_inode_add_extents(gfid, num_extents, extents); - if (ret) { - LOGERR("failed to add extents from %d (ret=%d)", - sender, ret); - } + LOGDBG("got chunk read responses buffer (%zu bytes)", bulk_sz); + + /* process read replies we just received */ + int rc = rm_post_chunk_read_responses(app_id, client_id, + src_rank, req_id, + num_chks, bulk_sz, + resp_buf); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to handle chunk read responses"); + ret = rc; } - margo_bulk_free(bulk_handle); } - free(extents_buf); } margo_free_input(handle, &in); } - /* build our output values */ - add_extents_out_t out; + /* return to caller */ out.ret = ret; - - /* send output back to caller */ hret = margo_respond(handle, &out); if (hret != HG_SUCCESS) { LOGERR("margo_respond() failed"); @@ -174,7 +403,12 @@ static void add_extents_rpc(hg_handle_t handle) /* free margo resources */ margo_destroy(handle); } -DEFINE_MARGO_RPC_HANDLER(add_extents_rpc) 
+DEFINE_MARGO_RPC_HANDLER(chunk_read_response_rpc) + + +/************************************************************************* + * File extents metadata update request + *************************************************************************/ /* Add extents to target file */ int unifyfs_invoke_add_extents_rpc(int gfid, @@ -190,7 +424,7 @@ int unifyfs_invoke_add_extents_rpc(int gfid, /* forward request to file owner */ p2p_request preq; hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.extent_add_id; - int rc = get_request_handle(req_hgid, owner_rank, &preq); + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -213,14 +447,14 @@ int unifyfs_invoke_add_extents_rpc(int gfid, in.gfid = (int32_t) gfid; in.num_extents = (int32_t) num_extents; in.extents = bulk_handle; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } margo_bulk_free(bulk_handle); /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -242,119 +476,77 @@ int unifyfs_invoke_add_extents_rpc(int gfid, return ret; } -/************************************************************************* - * File extents metadata lookup request - *************************************************************************/ - -/* find extents rpc handler */ -static void find_extents_rpc(hg_handle_t handle) +/* Add extents rpc handler */ +static void add_extents_rpc(hg_handle_t handle) { - LOGDBG("find_extents rpc handler"); - - int32_t ret; - unsigned int num_chunks = 0; - chunk_read_req_t* chunk_locs = NULL; - - const struct hg_info* hgi = margo_get_info(handle); - assert(hgi); + LOGDBG("add_extents rpc handler"); - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); + int ret = UNIFYFS_SUCCESS; /* get input params */ - find_extents_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + add_extents_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - int sender = in.src_rank; - int gfid = in.gfid; - size_t num_extents = (size_t) in.num_extents; - size_t bulk_sz = num_extents * sizeof(unifyfs_inode_extent_t); - - /* make sure I'm the owner */ - assert(glb_pmi_rank == hash_gfid_to_server(gfid)); - - /* allocate memory for extents */ - void* extents_buf = malloc(bulk_sz); - if (NULL == extents_buf) { - LOGERR("allocation for bulk extents failed"); - ret = ENOMEM; + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; } else { - /* register local target buffer for bulk access */ - hg_bulk_t bulk_req_handle; - hret = margo_bulk_create(mid, 1, &extents_buf, &bulk_sz, - HG_BULK_WRITE_ONLY, &bulk_req_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + size_t num_extents = (size_t) in->num_extents; + size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); + + /* allocate memory for extents */ + void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, + bulk_sz, NULL); + if (NULL == extents_buf) { + LOGERR("failed to get bulk extents"); ret = UNIFYFS_ERROR_MARGO; } else { - /* get list of read requests */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, - hgi->addr, 
in.extents, 0, - bulk_req_handle, 0, - bulk_sz); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* lookup requested extents */ - unifyfs_inode_extent_t* extents = extents_buf; - unsigned int n_ext = (unsigned int) num_extents; - LOGDBG("received %u extent lookups for gfid=%d from %d", - n_ext, gfid, sender); - ret = unifyfs_inode_resolve_extent_chunks(n_ext, extents, - &num_chunks, - &chunk_locs); - if (ret) { - LOGERR("failed to find extents for %d (ret=%d)", - sender, ret); - } else if (num_chunks == 0) { - LOGDBG("extent lookup found no matching chunks"); - } - } - margo_bulk_free(bulk_req_handle); + req->req_type = UNIFYFS_SERVER_RPC_EXTENTS_ADD; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = extents_buf; + req->bulk_sz = bulk_sz; + ret = sm_submit_service_request(req); + } + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); } - free(extents_buf); } - margo_free_input(handle, &in); } - /* define a bulk handle to transfer chunk address info */ - hg_bulk_t bulk_resp_handle = HG_BULK_NULL; - if (ret == UNIFYFS_SUCCESS) { - if (num_chunks > 0) { - void* buf = (void*) chunk_locs; - size_t buf_sz = (size_t)num_chunks * sizeof(chunk_read_req_t); - hret = margo_bulk_create(mid, 1, &buf, &buf_sz, - HG_BULK_READ_ONLY, &bulk_resp_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + if (NULL != req->bulk_buf) { + free(req->bulk_buf); } + free(req); } - } - /* fill rpc response struct with output values */ - find_extents_out_t out; - out.ret = ret; - out.num_locations = num_chunks; - out.locations = bulk_resp_handle; - - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + add_extents_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - if (bulk_resp_handle != HG_BULK_NULL) { - margo_bulk_free(bulk_resp_handle); + /* free margo resources */ + margo_destroy(handle); } - margo_destroy(handle); } -DEFINE_MARGO_RPC_HANDLER(find_extents_rpc) +DEFINE_MARGO_RPC_HANDLER(add_extents_rpc) + + +/************************************************************************* + * File extents metadata lookup request + *************************************************************************/ /* Lookup extent locations for target file */ int unifyfs_invoke_find_extents_rpc(int gfid, @@ -373,13 +565,12 @@ int unifyfs_invoke_find_extents_rpc(int gfid, /* do local inode metadata lookup to check for laminated */ unifyfs_file_attr_t attrs; - int ret = unifyfs_inode_metaget(gfid, &attrs); + int ret = sm_get_fileattr(gfid, &attrs); if (ret == UNIFYFS_SUCCESS) { if (attrs.is_laminated || (owner_rank == glb_pmi_rank)) { /* do local lookup */ - ret = unifyfs_inode_resolve_extent_chunks(num_extents, - extents, - num_chunks, chunks); + ret = sm_find_extents(gfid, (size_t)num_extents, extents, + num_chunks, chunks); if (ret) { LOGERR("failed to find extents for gfid=%d (ret=%d)", gfid, ret); @@ -394,7 +585,7 @@ int unifyfs_invoke_find_extents_rpc(int gfid, p2p_request preq; margo_instance_id mid = unifyfsd_rpc_context->svr_mid; hg_id_t req_hgid = 
unifyfsd_rpc_context->rpcs.extent_lookup_id; - int rc = get_request_handle(req_hgid, owner_rank, &preq); + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -416,14 +607,14 @@ int unifyfs_invoke_find_extents_rpc(int gfid, in.gfid = (int32_t) gfid; in.num_extents = (int32_t) num_extents; in.extents = bulk_req_handle; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } margo_bulk_free(bulk_req_handle); /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -441,41 +632,20 @@ int unifyfs_invoke_find_extents_rpc(int gfid, /* get number of chunks */ unsigned int n_chks = (unsigned int) out.num_locations; if (n_chks > 0) { - /* got some chunks to read, allocate a buffer - * to hold chunk location data */ + /* got some chunks to read, get bulk buffer + * holding chunk location data */ buf_sz = (size_t)n_chks * sizeof(chunk_read_req_t); - buf = malloc(buf_sz); + buf = pull_margo_bulk_buffer(preq.handle, out.locations, buf_sz, + NULL); if (NULL == buf) { - LOGERR("allocation for bulk locations failed"); - ret = ENOMEM; + LOGERR("failed to get bulk chunk locations"); + ret = UNIFYFS_ERROR_MARGO; } else { - /* create a margo bulk transfer handle for - * locations array */ - hg_bulk_t bulk_resp_handle; - hret = margo_bulk_create(mid, 1, &buf, &buf_sz, - HG_BULK_WRITE_ONLY, - &bulk_resp_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* pull locations array */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, - preq.peer, out.locations, 0, - bulk_resp_handle, 0, - buf_sz); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* lookup requested extents */ - LOGDBG("received %u chunk locations for gfid=%d", - n_chks, gfid); - *chunks = (chunk_read_req_t*) buf; - *num_chunks = (unsigned int) n_chks; - } - margo_bulk_free(bulk_resp_handle); - } + /* lookup requested extents */ + LOGDBG("received %u chunk locations for gfid=%d", + n_chks, gfid); + *chunks = (chunk_read_req_t*) buf; + *num_chunks = (unsigned int) n_chks; } } } @@ -486,47 +656,80 @@ int unifyfs_invoke_find_extents_rpc(int gfid, return ret; } -/************************************************************************* - * File attributes request - *************************************************************************/ - -/* Metaget rpc handler */ -static void metaget_rpc(hg_handle_t handle) +/* find extents rpc handler */ +static void find_extents_rpc(hg_handle_t handle) { - LOGDBG("metaget rpc handler"); + LOGDBG("find_extents rpc handler"); int32_t ret; - /* initialize invalid attributes */ - unifyfs_file_attr_t attrs; - unifyfs_file_attr_set_invalid(&attrs); - /* get input params */ - metaget_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + find_extents_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - ret = unifyfs_inode_metaget(in.gfid, &attrs); - margo_free_input(handle, &in); + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + size_t num_extents = (size_t) in->num_extents; + 
size_t bulk_sz = num_extents * sizeof(unifyfs_inode_extent_t); + + /* allocate memory for extents */ + void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, + bulk_sz, NULL); + if (NULL == extents_buf) { + LOGERR("failed to get bulk extents"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_EXTENTS_FIND; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = extents_buf; + req->bulk_sz = bulk_sz; + ret = sm_submit_service_request(req); + } + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } } - /* fill output values */ - metaget_out_t out; - out.ret = ret; - out.attr = attrs; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + if (NULL != req->bulk_buf) { + free(req->bulk_buf); + } + free(req); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + find_extents_out_t out; + out.ret = (int32_t) ret; + out.num_locations = 0; + out.locations = HG_BULK_NULL; - /* free margo resources */ - margo_destroy(handle); + /* send output back to caller */ + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + margo_destroy(handle); + } } -DEFINE_MARGO_RPC_HANDLER(metaget_rpc) +DEFINE_MARGO_RPC_HANDLER(find_extents_rpc) + + +/************************************************************************* + * File attributes request + *************************************************************************/ /* Get file attributes for target file */ int unifyfs_invoke_metaget_rpc(int gfid, @@ -537,27 +740,38 @@ int unifyfs_invoke_metaget_rpc(int gfid, } int owner_rank = hash_gfid_to_server(gfid); + int need_local_metadata = 0; - /* do local inode metadata lookup to check for laminated */ - int rc = unifyfs_inode_metaget(gfid, attrs); - if ((rc == UNIFYFS_SUCCESS) && (attrs->is_laminated)) { - /* if laminated, we already have final metadata locally */ - return UNIFYFS_SUCCESS; - } + /* do local inode metadata lookup */ + int rc = sm_get_fileattr(gfid, attrs); if (owner_rank == glb_pmi_rank) { + /* local server is the owner */ return rc; - } + } else if (rc == UNIFYFS_SUCCESS) { + if (attrs->is_laminated) { + /* if laminated, we already have final metadata locally */ + return UNIFYFS_SUCCESS; + } - int need_local_metadata = 0; - if (rc == ENOENT) { - /* inode_metaget above failed with ENOENT, need to create inode */ + /* use cached attributes if within threshold */ + struct timespec tp = {0}; + clock_gettime(CLOCK_REALTIME, &tp); + time_t expire = attrs->ctime.tv_sec + UNIFYFS_METADATA_CACHE_SECONDS; + if (tp.tv_sec <= expire) { + LOGINFO("using cached attributes for gfid=%d", gfid); + return UNIFYFS_SUCCESS; + } else { + LOGINFO("cached attributes have expired"); + } + } else if (rc == ENOENT) { + /* metaget above failed with ENOENT, need to create inode */ need_local_metadata = 1; } /* forward request to file owner */ p2p_request preq; hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.metaget_id; - rc = get_request_handle(req_hgid, owner_rank, &preq); + rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -565,13 +779,13 @@ int unifyfs_invoke_metaget_rpc(int gfid, /* fill rpc input struct and forward request */ metaget_in_t in; in.gfid = (int32_t)gfid; - rc = forward_request((void*)&in, &preq); + rc 
= forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -592,8 +806,7 @@ int unifyfs_invoke_metaget_rpc(int gfid, attrs->filename = strdup(out.attr.filename); } if (need_local_metadata) { - unifyfs_inode_metaset(gfid, UNIFYFS_FILE_ATTR_OP_CREATE, - attrs); + sm_set_fileattr(gfid, UNIFYFS_FILE_ATTR_OP_CREATE, attrs); } } margo_free_output(preq.handle, &out); @@ -603,44 +816,64 @@ int unifyfs_invoke_metaget_rpc(int gfid, return ret; } -/************************************************************************* - * File size request - *************************************************************************/ - -/* Filesize rpc handler */ -static void filesize_rpc(hg_handle_t handle) +/* Metaget rpc handler */ +static void metaget_rpc(hg_handle_t handle) { - LOGDBG("filesize rpc handler"); + LOGDBG("metaget rpc handler"); - int32_t ret; - hg_size_t filesize = 0; + int ret = UNIFYFS_SUCCESS; /* get input params */ - filesize_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + metaget_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - ret = unifyfs_inode_get_filesize(in.gfid, &filesize); - margo_free_input(handle, &in); + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_METAGET; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } } - /* build our output values */ - filesize_out_t out; - out.ret = ret; - out.filesize = filesize; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + metaget_out_t out; + out.ret = (int32_t) ret; + unifyfs_file_attr_set_invalid(&(out.attr)); + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } -DEFINE_MARGO_RPC_HANDLER(filesize_rpc) +DEFINE_MARGO_RPC_HANDLER(metaget_rpc) + + +/************************************************************************* + * File size request + *************************************************************************/ /* Get current global size for the target file */ int unifyfs_invoke_filesize_rpc(int gfid, @@ -654,7 +887,7 @@ int unifyfs_invoke_filesize_rpc(int gfid, /* do local inode metadata lookup to check for laminated */ unifyfs_file_attr_t attrs; - int rc = unifyfs_inode_metaget(gfid, &attrs); + int rc = sm_get_fileattr(gfid, &attrs); if ((rc == UNIFYFS_SUCCESS) && (attrs.is_laminated)) { /* if laminated, we already have final metadata stored locally */ *filesize = (size_t) attrs.size; @@ -668,7 +901,7 @@ int unifyfs_invoke_filesize_rpc(int gfid, /* forward request to file owner */ p2p_request preq; 
hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.filesize_id; - rc = get_request_handle(req_hgid, owner_rank, &preq); + rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -676,13 +909,13 @@ int unifyfs_invoke_filesize_rpc(int gfid, /* fill rpc input struct and forward request */ filesize_in_t in; in.gfid = (int32_t)gfid; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -707,43 +940,64 @@ int unifyfs_invoke_filesize_rpc(int gfid, return ret; } -/************************************************************************* - * File attributes update request - *************************************************************************/ - -/* Metaset rpc handler */ -static void metaset_rpc(hg_handle_t handle) +/* Filesize rpc handler */ +static void filesize_rpc(hg_handle_t handle) { - LOGDBG("metaset rpc handler"); + LOGDBG("filesize rpc handler"); - int32_t ret; + int ret = UNIFYFS_SUCCESS; /* get input params */ - metaset_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + filesize_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - unifyfs_file_attr_op_e attr_op = in.fileop; - ret = unifyfs_inode_metaset(in.gfid, attr_op, &(in.attr)); - margo_free_input(handle, &in); + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_FILESIZE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } } - /* build our output values */ - metaset_out_t out; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + filesize_out_t out; + out.ret = (int32_t) ret; + out.filesize = 0; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } -DEFINE_MARGO_RPC_HANDLER(metaset_rpc) +DEFINE_MARGO_RPC_HANDLER(filesize_rpc) + + +/************************************************************************* + * File attributes update request + *************************************************************************/ /* Set metadata for target file */ int unifyfs_invoke_metaset_rpc(int gfid, @@ -754,16 +1008,21 @@ int unifyfs_invoke_metaset_rpc(int gfid, return EINVAL; } + int ret = sm_set_fileattr(gfid, attr_op, attrs); + if (ret != UNIFYFS_SUCCESS) { + return ret; + } + int owner_rank = hash_gfid_to_server(gfid); if (owner_rank == glb_pmi_rank) { - /* I'm the owner, do local inode metadata update */ - return unifyfs_inode_metaset(gfid, attr_op, attrs); + /* I'm the owner, return local result */ + 
return ret; } /* forward request to file owner */ p2p_request preq; hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.metaset_id; - int rc = get_request_handle(req_hgid, owner_rank, &preq); + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -773,19 +1032,18 @@ int unifyfs_invoke_metaset_rpc(int gfid, in.gfid = (int32_t) gfid; in.fileop = (int32_t) attr_op; in.attr = *attrs; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* get the output of the rpc */ - int ret; metaset_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { @@ -795,59 +1053,69 @@ int unifyfs_invoke_metaset_rpc(int gfid, /* set return value */ ret = out.ret; margo_free_output(preq.handle, &out); - - /* if update at owner succeeded, do it locally */ - if (ret == UNIFYFS_SUCCESS) { - ret = unifyfs_inode_metaset(gfid, attr_op, attrs); - } } margo_destroy(preq.handle); return ret; } -/************************************************************************* - * File lamination request - *************************************************************************/ - -/* Laminate rpc handler */ -static void laminate_rpc(hg_handle_t handle) +/* Metaset rpc handler */ +static void metaset_rpc(hg_handle_t handle) { - LOGDBG("laminate rpc handler"); + LOGDBG("metaset rpc handler"); - int32_t ret; + int ret = UNIFYFS_SUCCESS; /* get input params */ - laminate_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + metaset_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - int gfid = (int) in.gfid; - margo_free_input(handle, &in); - - ret = unifyfs_inode_laminate(gfid); - if (ret == UNIFYFS_SUCCESS) { - /* tell the rest of the servers */ - ret = unifyfs_invoke_broadcast_laminate(gfid); + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_METASET; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } } } - /* build our output values */ - laminate_out_t out; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + metaset_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } -DEFINE_MARGO_RPC_HANDLER(laminate_rpc) +DEFINE_MARGO_RPC_HANDLER(metaset_rpc) + + +/************************************************************************* + * File lamination request + *************************************************************************/ 
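Editor's note (illustrative, not part of the patch): the metaset path just above, like the metaget, filesize, laminate, and truncate paths in this file, drives the owning server through the same peer-to-peer sequence introduced by this refactor: build a p2p_request handle for the owner rank, forward the operation's input struct, wait for completion, then decode the output and destroy the handle. The condensed sketch below shows that caller-side shape only; foo_in_t, foo_out_t, and rpcs.foo_id are placeholders standing in for a real operation's types and registered rpc id, and it assumes the usual unifyfsd server headers are in scope.

/* sketch: caller-side p2p rpc lifecycle for a placeholder "foo" operation */
static int invoke_foo_rpc_sketch(int gfid)
{
    /* the gfid hash picks the server that owns this file's metadata */
    int owner_rank = hash_gfid_to_server(gfid);

    /* create a request handle to the owner's registered rpc */
    p2p_request preq;
    hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.foo_id; /* placeholder id */
    int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq);
    if (rc != UNIFYFS_SUCCESS) {
        return rc;
    }

    /* fill the operation's input struct and forward it to the owner */
    foo_in_t in;                      /* placeholder input type */
    in.gfid = (int32_t) gfid;
    rc = forward_p2p_request((void*)&in, &preq);
    if (rc != UNIFYFS_SUCCESS) {
        return rc;
    }

    /* wait for the forwarded request to complete */
    rc = wait_for_p2p_request(&preq);
    if (rc != UNIFYFS_SUCCESS) {
        return rc;
    }

    /* decode the owner's response and release margo resources */
    int ret;
    foo_out_t out;                    /* placeholder output type */
    hg_return_t hret = margo_get_output(preq.handle, &out);
    if (hret != HG_SUCCESS) {
        LOGERR("margo_get_output() failed");
        ret = UNIFYFS_ERROR_MARGO;
    } else {
        ret = (int) out.ret;
        margo_free_output(preq.handle, &out);
    }
    margo_destroy(preq.handle);
    return ret;
}
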
/* Laminate the target file */ int unifyfs_invoke_laminate_rpc(int gfid) @@ -856,18 +1124,13 @@ int unifyfs_invoke_laminate_rpc(int gfid) int owner_rank = hash_gfid_to_server(gfid); if (owner_rank == glb_pmi_rank) { /* I'm the owner, do local inode metadata update */ - ret = unifyfs_inode_laminate(gfid); - if (ret == UNIFYFS_SUCCESS) { - /* tell the rest of the servers */ - ret = unifyfs_invoke_broadcast_laminate(gfid); - } - return ret; + return sm_laminate(gfid); } /* forward request to file owner */ p2p_request preq; hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.laminate_id; - int rc = get_request_handle(req_hgid, owner_rank, &preq); + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -875,13 +1138,13 @@ int unifyfs_invoke_laminate_rpc(int gfid) /* fill rpc input struct and forward request */ laminate_in_t in; in.gfid = (int32_t)gfid; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -902,48 +1165,63 @@ int unifyfs_invoke_laminate_rpc(int gfid) return ret; } -/************************************************************************* - * File truncation request - *************************************************************************/ - -/* Truncate rpc handler */ -static void truncate_rpc(hg_handle_t handle) +/* Laminate rpc handler */ +static void laminate_rpc(hg_handle_t handle) { - LOGDBG("truncate rpc handler"); + LOGDBG("laminate rpc handler"); - int32_t ret; + int ret = UNIFYFS_SUCCESS; /* get input params */ - truncate_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; + laminate_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; } else { - int gfid = (int) in.gfid; - size_t fsize = (size_t) in.filesize; - ret = unifyfs_invoke_broadcast_truncate(gfid, fsize); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("truncate(gfid=%d, size=%zu) broadcast failed", - gfid, fsize); + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_LAMINATE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } } - margo_free_input(handle, &in); } - /* build our output values */ - truncate_out_t out; - out.ret = ret; + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } - /* send output back to caller */ - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } + /* return to caller */ + laminate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* free margo resources */ - margo_destroy(handle); + /* free margo resources */ + margo_destroy(handle); + } } -DEFINE_MARGO_RPC_HANDLER(truncate_rpc) +DEFINE_MARGO_RPC_HANDLER(laminate_rpc) + + 
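Editor's note (illustrative, not part of the patch): on the owning server, the laminate handler above, like the metaget, metaset, filesize, and truncate handlers in this refactor, no longer performs the operation inline. It decodes the rpc input, packages it as a server_rpc_req_t, and queues it to the service manager with sm_submit_service_request(); the service manager responds to the caller once the work is done, so the handler only replies directly when allocation, input decoding, or submission fails. The sketch below condenses that shape under the same assumptions as the previous note: foo_in_t, foo_out_t, and UNIFYFS_SERVER_RPC_FOO are placeholders for a real operation's names.

/* sketch: owner-side handler that defers work to the service manager */
static void foo_rpc_sketch(hg_handle_t handle)
{
    int ret = UNIFYFS_SUCCESS;

    /* heap-allocate the input and request so they outlive this handler */
    foo_in_t* in = malloc(sizeof(*in));           /* placeholder input type */
    server_rpc_req_t* req = malloc(sizeof(*req));
    if ((NULL == in) || (NULL == req)) {
        ret = ENOMEM;
    } else {
        hg_return_t hret = margo_get_input(handle, in);
        if (hret != HG_SUCCESS) {
            LOGERR("margo_get_input() failed");
            ret = UNIFYFS_ERROR_MARGO;
        } else {
            /* hand the decoded request off to the service manager thread */
            req->req_type = UNIFYFS_SERVER_RPC_FOO; /* placeholder enum value */
            req->handle = handle;
            req->input = (void*) in;
            req->bulk_buf = NULL;
            req->bulk_sz = 0;
            ret = sm_submit_service_request(req);
            if (ret != UNIFYFS_SUCCESS) {
                margo_free_input(handle, in);
            }
        }
    }

    /* only respond here on a submission error; on success the service
     * manager sends the response after servicing the request */
    if (ret != UNIFYFS_SUCCESS) {
        free(in);
        free(req);

        foo_out_t out;                            /* placeholder output type */
        out.ret = (int32_t) ret;
        hg_return_t hret = margo_respond(handle, &out);
        if (hret != HG_SUCCESS) {
            LOGERR("margo_respond() failed");
        }
        margo_destroy(handle);
    }
}
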
+/************************************************************************* + * File truncation request + *************************************************************************/ /* Truncate the target file */ int unifyfs_invoke_truncate_rpc(int gfid, @@ -951,15 +1229,13 @@ int unifyfs_invoke_truncate_rpc(int gfid, { int owner_rank = hash_gfid_to_server(gfid); if (owner_rank == glb_pmi_rank) { - /* I'm the owner, start broadcast update. The local inode will be - * updated as part of this update. */ - return unifyfs_invoke_broadcast_truncate(gfid, filesize); + return sm_truncate(gfid, filesize); } /* forward request to file owner */ p2p_request preq; hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.truncate_id; - int rc = get_request_handle(req_hgid, owner_rank, &preq); + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -968,13 +1244,13 @@ int unifyfs_invoke_truncate_rpc(int gfid, truncate_in_t in; in.gfid = (int32_t) gfid; in.filesize = (hg_size_t) filesize; - rc = forward_request((void*)&in, &preq); + rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { return rc; } /* wait for request completion */ - rc = wait_for_request(&preq); + rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -995,3 +1271,154 @@ int unifyfs_invoke_truncate_rpc(int gfid, return ret; } + +/* Truncate rpc handler */ +static void truncate_rpc(hg_handle_t handle) +{ + LOGDBG("truncate rpc handler"); + + int ret = UNIFYFS_SUCCESS; + + /* get input params */ + truncate_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; + } else { + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_TRUNCATE; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } + + /* return to caller */ + truncate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(truncate_rpc) + +/************************************************************************* + * Server pid report + *************************************************************************/ + +int unifyfs_invoke_server_pid_rpc(void) +{ + /* forward pid to server rank 0 */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.server_pid_id; + int rc = get_p2p_request_handle(req_hgid, 0, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + server_pid_in_t in; + in.rank = glb_pmi_rank; + in.pid = server_pid; + rc = forward_p2p_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_p2p_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of the rpc */ + int ret; + server_pid_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + 
LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +static void server_pid_rpc(hg_handle_t handle) +{ + LOGDBG("server pid report rpc handler"); + + int ret = UNIFYFS_SUCCESS; + + /* get input params */ + server_pid_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; + } else { + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_PID_REPORT; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } + + /* return to caller */ + server_pid_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(server_pid_rpc) diff --git a/server/src/unifyfs_p2p_rpc.h b/server/src/unifyfs_p2p_rpc.h index 9710a8ca4..70af349c6 100644 --- a/server/src/unifyfs_p2p_rpc.h +++ b/server/src/unifyfs_p2p_rpc.h @@ -17,15 +17,56 @@ #include "unifyfs_global.h" #include "extent_tree.h" +#include "margo_server.h" #include "unifyfs_inode.h" - - -/* Point-to-point Server RPCs */ +#include "unifyfs_request_manager.h" +#include "unifyfs_service_manager.h" /* determine server responsible for maintaining target file's metadata */ int hash_gfid_to_server(int gfid); +/* server peer-to-peer (p2p) margo request structure */ +typedef struct { + margo_request request; + hg_addr_t peer; + hg_handle_t handle; +} p2p_request; + +/* helper method to initialize peer request rpc handle */ +int get_p2p_request_handle(hg_id_t request_hgid, + int peer_rank, + p2p_request* req); + +/* helper method to forward peer rpc request */ +int forward_p2p_request(void* input_ptr, + p2p_request* req); + +/* helper method to wait for peer rpc request completion */ +int wait_for_p2p_request(p2p_request* req); + +/*** Point-to-point Server RPCs ***/ + +/** + * @brief Request chunk reads from remote server + * + * @param dst_srvr_rank remote server rank + * @param rdreq read request structure + * @param remote_reads server chunk reads + * + * @return success|failure + */ +int invoke_chunk_read_request_rpc(int dst_srvr_rank, + server_read_req_t* rdreq, + server_chunk_reads_t* remote_reads); +/** + * @brief Respond to chunk read request + * + * @param scr server chunk reads structure + * + * @return success|failure + */ +int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr); /** * @brief Add new extents to target file @@ -112,5 +153,11 @@ int unifyfs_invoke_metaset_rpc(int gfid, int attr_op, */ int unifyfs_invoke_truncate_rpc(int gfid, size_t filesize); +/** + * @brief Report pid of local server to rank 0 server + * + * @return success|failure + */ +int unifyfs_invoke_server_pid_rpc(void); #endif // UNIFYFS_P2P_RPC_H diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 072a3ad8e..6c2b71db5 100644 --- 
a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -37,9 +37,11 @@ #include "unifyfs_service_manager.h" // margo rpcs +#include "margo_server.h" #include "unifyfs_group_rpc.h" +#include "unifyfs_p2p_rpc.h" + #include "unifyfs_server_rpcs.h" -#include "margo_server.h" #define RM_LOCK(rm) \ @@ -243,44 +245,28 @@ int rm_release_read_req(reqmgr_thrd_t* thrd_ctrl, static void signal_new_requests(reqmgr_thrd_t* reqmgr) { - RM_LOCK(reqmgr); pid_t this_thread = unifyfs_gettid(); if (this_thread != reqmgr->tid) { - /* wake up the request manager thread for the requesting client */ - if (!reqmgr->waiting_for_work) { - /* reqmgr thread is not waiting, but we are in critical - * section, we just added requests so we must wait for reqmgr - * to signal us that it's reached the critical section before - * we escape so we don't overwrite these requests before it - * has had a chance to process them */ - reqmgr->has_waiting_dispatcher = 1; - pthread_cond_wait(&reqmgr->thrd_cond, &reqmgr->thrd_lock); - - /* reqmgr thread has signaled us that it's now waiting */ - reqmgr->has_waiting_dispatcher = 0; - } - /* have a reqmgr thread waiting on condition variable, - * signal it to begin processing the requests we just added */ + /* signal reqmgr to begin processing the requests we just added */ LOGDBG("signaling new requests"); pthread_cond_signal(&reqmgr->thrd_cond); } - RM_UNLOCK(reqmgr); } static void signal_new_responses(reqmgr_thrd_t* reqmgr) { - RM_LOCK(reqmgr); pid_t this_thread = unifyfs_gettid(); if (this_thread != reqmgr->tid) { /* wake up the request manager thread */ + RM_LOCK(reqmgr); if (reqmgr->waiting_for_work) { /* have a reqmgr thread waiting on condition variable, * signal it to begin processing the responses we just added */ LOGDBG("signaling new responses"); pthread_cond_signal(&reqmgr->thrd_cond); } + RM_UNLOCK(reqmgr); } - RM_UNLOCK(reqmgr); } /* issue remote chunk read requests for extent chunks @@ -493,51 +479,6 @@ int rm_request_exit(reqmgr_thrd_t* thrd_ctrl) * These functions define the logic of the request manager thread ***********************/ -/* pack the chunk read requests for a single remote server. 
- * - * @param req_msg_buf: request buffer used for packing - * @param req_num: number of read requests - * @return size of packed buffer (or error code) - */ -static size_t rm_pack_chunk_requests(char* req_msg_buf, - server_chunk_reads_t* remote_reads) -{ - /* send format: - * (int) cmd - specifies type of message (SVC_CMD_RDREQ_CHK) - * (int) req_cnt - number of requests in message - * (size_t) total_sz - total number of bytes requested - * {sequence of chunk_read_req_t} */ - int req_cnt = remote_reads->num_chunks; - size_t reqs_sz = req_cnt * sizeof(chunk_read_req_t); - size_t packed_size = (2 * sizeof(int)) + sizeof(size_t) + reqs_sz; - - assert(req_cnt <= MAX_META_PER_SEND); - - /* get pointer to start of send buffer */ - char* ptr = req_msg_buf; - memset(ptr, 0, packed_size); - - /* pack command */ - int cmd = (int)SVC_CMD_RDREQ_CHK; - *((int*)ptr) = cmd; - ptr += sizeof(int); - - /* pack request count */ - *((int*)ptr) = req_cnt; - ptr += sizeof(int); - - /* pack total requested data size */ - *((size_t*)ptr) = remote_reads->total_sz; - ptr += sizeof(size_t); - - /* copy requests into buffer */ - memcpy(ptr, remote_reads->reqs, reqs_sz); - ptr += reqs_sz; - - /* return number of bytes used to pack requests */ - return packed_size; -} - /* send the chunk read requests to remote servers * * @param thrd_ctrl : reqmgr thread control structure @@ -548,13 +489,13 @@ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) int i, j, rc; int ret = (int)UNIFYFS_SUCCESS; - /* get pointer to send buffer */ - char* sendbuf = thrd_ctrl->del_req_msg_buf; - /* iterate over each active read request */ RM_REQ_LOCK(thrd_ctrl); for (i = 0; i < RM_MAX_SERVER_READS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; + if (!req->in_use) { + continue; + } if (req->num_server_reads > 0) { LOGDBG("read req %d is active", i); debug_print_read_req(req); @@ -562,28 +503,21 @@ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) req->status = READREQ_STARTED; /* iterate over each server we need to send requests to */ server_chunk_reads_t* remote_reads; - size_t packed_sz; for (j = 0; j < req->num_server_reads; j++) { remote_reads = req->remote_reads + j; remote_reads->status = READREQ_STARTED; - /* pack requests into send buffer, get packed size */ - packed_sz = rm_pack_chunk_requests(sendbuf, remote_reads); - - /* get rank of target server */ - int del_rank = remote_reads->rank; - /* send requests */ - LOGDBG("[%d of %d] sending %d chunk requests to server %d", + int remote_rank = remote_reads->rank; + LOGDBG("[%d of %d] sending %d chunk requests to server[%d]", j, req->num_server_reads, - remote_reads->num_chunks, del_rank); - rc = invoke_chunk_read_request_rpc(del_rank, req, - remote_reads->num_chunks, - sendbuf, packed_sz); - if (rc != (int)UNIFYFS_SUCCESS) { + remote_reads->num_chunks, remote_rank); + rc = invoke_chunk_read_request_rpc(remote_rank, req, + remote_reads); + if (rc != UNIFYFS_SUCCESS) { ret = rc; LOGERR("server request rpc to %d failed - %s", - del_rank, + remote_rank, unifyfs_rc_enum_str((unifyfs_rc)rc)); } } @@ -617,6 +551,9 @@ static int rm_process_remote_chunk_responses(reqmgr_thrd_t* thrd_ctrl) /* iterate over each active read request */ for (i = 0; i < RM_MAX_SERVER_READS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; + if (!req->in_use) { + continue; + } if (req->status == READREQ_STARTED) { if (req->num_server_reads > 0) { /* iterate over each server we sent requests to */ @@ -858,8 +795,8 @@ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, 
mread_id, read_ndx, errcode); if (rc != UNIFYFS_SUCCESS) { - LOGERR("mread[%d] request %d completion rpc failed", - mread_id, read_ndx); + LOGERR("mread[%d] request %d completion rpc failed (rc=%d)", + mread_id, read_ndx, rc); ret = rc; } } @@ -1103,7 +1040,9 @@ static int process_metaset_rpc(reqmgr_thrd_t* reqmgr, if (NULL != in->attr.filename) { fattr.filename = strdup(in->attr.filename); } - margo_free_input(req->handle, in); + if (HG_HANDLE_NULL != req->handle) { + margo_free_input(req->handle, in); + } free(in); LOGDBG("setting metadata for gfid=%d", gfid); @@ -1121,17 +1060,18 @@ static int process_metaset_rpc(reqmgr_thrd_t* reqmgr, free(fattr.filename); } - /* send rpc response */ - unifyfs_metaset_out_t out; - out.ret = (int32_t) ret; - hg_return_t hret = margo_respond(req->handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } - - /* cleanup req */ - margo_destroy(req->handle); + if (HG_HANDLE_NULL != req->handle) { + /* send rpc response */ + unifyfs_metaset_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + /* cleanup req */ + margo_destroy(req->handle); + } return ret; } @@ -1158,6 +1098,7 @@ static int process_read_rpc(reqmgr_thrd_t* reqmgr, if (ret != UNIFYFS_SUCCESS) { LOGERR("unifyfs_fops_read() failed"); } + free(req->bulk_buf); /* send rpc response */ unifyfs_mread_out_t out; @@ -1342,9 +1283,11 @@ void* request_manager_thread(void* arg) { /* get pointer to our thread control structure */ reqmgr_thrd_t* thrd_ctrl = (reqmgr_thrd_t*) arg; + int appid = thrd_ctrl->app_id; + int clid = thrd_ctrl->client_id; thrd_ctrl->tid = unifyfs_gettid(); - LOGDBG("I am request manager thread!"); + LOGINFO("I am request manager [app=%d:client=%d] thread!", appid, clid); /* loop forever to handle read requests from the client, * new requests are added to a list on a shared data structure @@ -1376,18 +1319,8 @@ void* request_manager_thread(void* arg) * inside the critical section */ thrd_ctrl->waiting_for_work = 1; - /* if dispatcher is waiting on us, signal it to go ahead, - * this coordination ensures that we'll be the next thread - * to grab the lock after the dispatcher has assigned us - * some work (rather than the dispatcher grabbing the lock - * and assigning yet more work) */ - if (thrd_ctrl->has_waiting_dispatcher == 1) { - pthread_cond_signal(&thrd_ctrl->thrd_cond); - } - /* release lock and wait to be signaled by dispatcher */ - LOGDBG("RM[%d:%d] waiting for work", - thrd_ctrl->app_id, thrd_ctrl->client_id); + //LOGDBG("RM[%d:%d] waiting for work", appid, clid); struct timespec timeout; clock_gettime(CLOCK_REALTIME, &timeout); timeout.tv_nsec += 10000000; /* 10 ms */ @@ -1399,11 +1332,10 @@ void* request_manager_thread(void* arg) &thrd_ctrl->thrd_lock, &timeout); if (0 == wait_rc) { - LOGDBG("RM[%d:%d] got work", - thrd_ctrl->app_id, thrd_ctrl->client_id); + LOGDBG("RM[%d:%d] got work", appid, clid); } else if (ETIMEDOUT != wait_rc) { LOGERR("RM[%d:%d] work condition wait failed (rc=%d)", - thrd_ctrl->app_id, thrd_ctrl->client_id, wait_rc); + appid, clid, wait_rc); } /* set flag to indicate we're no longer waiting */ @@ -1416,219 +1348,8 @@ void* request_manager_thread(void* arg) } } - LOGDBG("request manager thread exiting"); + LOGDBG("RM[%d:%d] thread exiting", appid, clid); return NULL; } -/* BEGIN MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ - -/* invokes the server_request rpc */ -int invoke_chunk_read_request_rpc(int dst_srvr_rank, - 
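With the dispatcher handshake removed, the request manager thread relies on a bounded wait: producers queue work and fire pthread_cond_signal(), and the thread wakes either on that signal or after a short timeout, so a signal that races past the wait only delays processing by one interval. A self-contained sketch of this timed-wait worker pattern, independent of any UnifyFS types, is shown below.

    /* Standalone sketch of the timed-wait worker pattern: producers signal
     * under the lock, and the worker bounds each wait so a missed wakeup
     * only delays work by one timeout interval.  Build with -pthread. */
    #include <pthread.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int pending;        /* work items queued by producers */
    static int time_to_exit;

    static void* worker(void* arg)
    {
        (void) arg;
        pthread_mutex_lock(&lock);
        while (!time_to_exit) {
            while (pending > 0) {
                pending--;     /* process one queued item */
            }

            /* wait for more work, but never longer than 10 ms */
            struct timespec deadline;
            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_nsec += 10000000;
            if (deadline.tv_nsec >= 1000000000L) {
                deadline.tv_nsec -= 1000000000L;
                deadline.tv_sec++;
            }
            pthread_cond_timedwait(&cond, &lock, &deadline);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    static void submit_work(void)
    {
        pthread_mutex_lock(&lock);
        pending++;
        pthread_cond_signal(&cond);  /* harmless if worker is not waiting */
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        pthread_t tid;
        pthread_create(&tid, NULL, worker, NULL);
        for (int i = 0; i < 3; i++) {
            submit_work();
        }
        pthread_mutex_lock(&lock);
        time_to_exit = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        pthread_join(tid, NULL);
        return 0;
    }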
server_read_req_t* rdreq, - int num_chunks, - void* data_buf, size_t buf_sz) -{ - if (dst_srvr_rank == glb_pmi_rank) { - // short-circuit for local requests - return sm_issue_chunk_reads(glb_pmi_rank, - rdreq->app_id, - rdreq->client_id, - rdreq->req_ndx, - num_chunks, - (char*)data_buf); - } - - int ret = UNIFYFS_SUCCESS; - hg_handle_t handle; - chunk_read_request_in_t in; - chunk_read_request_out_t out; - hg_return_t hret; - hg_addr_t dst_srvr_addr; - hg_size_t bulk_sz = buf_sz; - - assert(dst_srvr_rank < (int)glb_num_servers); - dst_srvr_addr = glb_servers[dst_srvr_rank].margo_svr_addr; - - hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, - unifyfsd_rpc_context->rpcs.chunk_read_request_id, - &handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_create() failed"); - return UNIFYFS_ERROR_MARGO; - } - - /* fill in input struct */ - in.src_rank = (int32_t)glb_pmi_rank; - in.app_id = (int32_t)rdreq->app_id; - in.client_id = (int32_t)rdreq->client_id; - in.req_id = (int32_t)rdreq->req_ndx; - in.num_chks = (int32_t)num_chunks; - in.bulk_size = bulk_sz; - - /* register request buffer for bulk remote access */ - hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, - &data_buf, &bulk_sz, - HG_BULK_READ_ONLY, &in.bulk_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - LOGDBG("invoking the chunk-read-request rpc function"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* decode response */ - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - ret = (int)out.ret; - LOGDBG("Got request rpc response from %d - ret=%d", - dst_srvr_rank, ret); - margo_free_output(handle, &out); - } else { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } - } - - margo_bulk_free(in.bulk_handle); - } - margo_destroy(handle); - - return ret; -} - - -/* BEGIN MARGO SERVER-SERVER RPC HANDLER FUNCTIONS */ - -/* handler for remote read request response */ -static void chunk_read_response_rpc(hg_handle_t handle) -{ - int32_t ret; - chunk_read_response_out_t out; - - /* get input params */ - chunk_read_response_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = (int32_t) UNIFYFS_ERROR_MARGO; - } else { - /* extract params from input struct */ - int src_rank = (int)in.src_rank; - int app_id = (int)in.app_id; - int client_id = (int)in.client_id; - int req_id = (int)in.req_id; - int num_chks = (int)in.num_chks; - size_t bulk_sz = (size_t)in.bulk_size; - - LOGDBG("received chunk read response from server %d (%d chunks)", - src_rank, num_chks); - - /* The input parameters specify the info for a bulk transfer - * buffer on the sending process. We use that info to pull data - * from the sender into a local buffer. This buffer contains - * the read reply headers and associated read data for requests - * we had sent earlier. */ - - /* pull the remote data via bulk transfer */ - if (0 == bulk_sz) { - /* sender is trying to send an empty buffer, - * don't think that should happen unless maybe - * we had sent a read request list that was empty? 
*/ - LOGERR("empty response buffer"); - ret = (int32_t)EINVAL; - } else { - /* allocate a buffer to hold the incoming data */ - char* resp_buf = (char*) malloc(bulk_sz); - if (NULL == resp_buf) { - /* allocation failed, that's bad */ - LOGERR("failed to allocate chunk read responses buffer"); - ret = (int32_t)ENOMEM; - } else { - /* got a buffer, now pull response data */ - ret = (int32_t)UNIFYFS_SUCCESS; - - /* get margo info */ - const struct hg_info* hgi = margo_get_info(handle); - assert(NULL != hgi); - - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - /* pass along address of buffer we want to transfer - * data into to prepare it for a bulk write, - * get resulting margo handle */ - hg_bulk_t bulk_handle; - hret = margo_bulk_create(mid, 1, - (void**)&resp_buf, &in.bulk_size, - HG_BULK_WRITE_ONLY, &bulk_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; - goto out_respond; - } - - /* execute the transfer to pull data from remote side - * into our local bulk transfer buffer. - * NOTE: mercury/margo bulk transfer does not check the maximum - * transfer size that the underlying transport supports, and a - * large bulk transfer may result in failure. */ - int i = 0; - hg_size_t remain = in.bulk_size; - do { - hg_size_t offset = i * MAX_BULK_TX_SIZE; - hg_size_t len = remain < MAX_BULK_TX_SIZE - ? remain : MAX_BULK_TX_SIZE; - - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, offset, - bulk_handle, offset, len); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer(off=%zu, sz=%zu) failed", - (size_t)offset, (size_t)len); - ret = UNIFYFS_ERROR_MARGO; - break; - } - - remain -= len; - i++; - } while (remain > 0); - - if (hret == HG_SUCCESS) { - LOGDBG("successful bulk transfer (%zu bytes)", bulk_sz); - - /* process read replies we just received */ - int rc = rm_post_chunk_read_responses(app_id, client_id, - src_rank, req_id, - num_chks, bulk_sz, - resp_buf); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to handle chunk read responses"); - ret = rc; - } - } else { - LOGERR("failed to perform bulk transfer"); - } - - /* deregister our bulk transfer buffer */ - margo_bulk_free(bulk_handle); - } - } - margo_free_input(handle, &in); - } - -out_respond: - /* return to caller */ - out.ret = ret; - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); - } - - /* free margo resources */ - margo_destroy(handle); -} -DEFINE_MARGO_RPC_HANDLER(chunk_read_response_rpc) diff --git a/server/src/unifyfs_request_manager.h b/server/src/unifyfs_request_manager.h index a2d54a484..e568f02f0 100644 --- a/server/src/unifyfs_request_manager.h +++ b/server/src/unifyfs_request_manager.h @@ -90,9 +90,6 @@ typedef struct reqmgr_thrd { /* list of client rpc requests */ arraylist_t* client_reqs; - /* buffer to build read request messages */ - char del_req_msg_buf[REQ_BUF_LEN]; - /* flag set to indicate request manager thread should exit */ int exit_flag; @@ -158,19 +155,12 @@ int rm_submit_read_request(server_read_req_t* req); /** * @brief submit a client rpc request to the request manager thread. 
* - * @param client application client context - * @param req pointer to client rpc request struct + * @param ctx application client context + * @param req pointer to client rpc request struct * * @return UNIFYFS_SUCCESS, or error code */ int rm_submit_client_rpc_request(unifyfs_fops_ctx_t* ctx, client_rpc_req_t* req); -/* MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ - -int invoke_chunk_read_request_rpc(int dst_srvr_rank, - server_read_req_t* rdreq, - int num_chunks, - void* data_buf, size_t buf_sz); - #endif diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 672763391..50ed9115e 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -62,13 +62,6 @@ static ABT_mutex app_configs_abt_sync; static app_config* app_configs[MAX_NUM_APPS]; /* list of apps */ static size_t clients_per_app = MAX_APP_CLIENTS; -/** - * @brief create a ready status file to notify that all servers are ready for - * accepting client requests. - * - * @return 0 on success, error otherwise - */ -int unifyfs_publish_server_pids(void); static int unifyfs_exit(void); @@ -348,12 +341,12 @@ int main(int argc, char* argv[]) exit(1); } if (glb_pmi_rank != kv_rank) { - LOGDBG("mismatch on pmi (%d) vs kvstore (%d) rank", + LOGWARN("mismatch on pmi (%d) vs kvstore (%d) rank", glb_pmi_rank, kv_rank); glb_pmi_rank = kv_rank; } if (glb_pmi_size != kv_nranks) { - LOGDBG("mismatch on pmi (%d) vs kvstore (%d) num ranks", + LOGWARN("mismatch on pmi (%d) vs kvstore (%d) num ranks", glb_pmi_size, kv_nranks); glb_pmi_size = kv_nranks; } @@ -372,21 +365,17 @@ int main(int argc, char* argv[]) LOGDBG("initializing rpc service"); ABT_init(argc, argv); ABT_mutex_create(&app_configs_abt_sync); - rc = configurator_bool_val(server_cfg.margo_tcp, &margo_use_tcp); + rc = configurator_bool_val(server_cfg.margo_lazy_connect, + &margo_lazy_connect); + rc = configurator_bool_val(server_cfg.margo_tcp, + &margo_use_tcp); rc = margo_server_rpc_init(); if (rc != UNIFYFS_SUCCESS) { LOGERR("%s", unifyfs_rc_enum_description(rc)); exit(1); } - LOGDBG("connecting rpc servers"); - rc = margo_connect_servers(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("%s", unifyfs_rc_enum_description(rc)); - exit(1); - } - - /* launch the service manager */ + /* launch the service manager (note: must happen after ABT_init) */ LOGDBG("launching service manager thread"); rc = svcmgr_init(); if (rc != (int)UNIFYFS_SUCCESS) { @@ -401,6 +390,13 @@ int main(int argc, char* argv[]) exit(1); } + LOGDBG("connecting rpc servers"); + rc = margo_connect_servers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("%s", unifyfs_rc_enum_description(rc)); + exit(1); + } + /* initialize our tree that maps a gfid to its extent tree */ unifyfs_inode_tree_init(global_inode_tree); @@ -596,6 +592,7 @@ static int unifyfs_exit(void) int ret = UNIFYFS_SUCCESS; /* iterate over each active application and free resources */ + LOGDBG("cleaning application state"); ABT_mutex_lock(app_configs_abt_sync); for (int i = 0; i < MAX_NUM_APPS; i++) { /* get pointer to app config for this app_id */ @@ -654,8 +651,13 @@ app_config* get_application(int app_id) } /* insert a new app config in app_configs[] */ -app_config* new_application(int app_id) +app_config* new_application(int app_id, + int* created) { + if (NULL != created) { + *created = 0; + } + ABT_mutex_lock(app_configs_abt_sync); /* don't have an app_config for this app_id, @@ -683,6 +685,9 @@ app_config* new_application(int app_id) new_app->clients_sz = clients_per_app; app_configs[i] = new_app; 
ABT_mutex_unlock(app_configs_abt_sync); + if (NULL != created) { + *created = 1; + } return new_app; } else if (existing->app_id == app_id) { /* someone beat us to it, use existing */ diff --git a/server/src/unifyfs_server_pid.c b/server/src/unifyfs_server_pid.c index a11362386..df71622d5 100644 --- a/server/src/unifyfs_server_pid.c +++ b/server/src/unifyfs_server_pid.c @@ -21,10 +21,12 @@ #include "unifyfs_configurator.h" #include "unifyfs_global.h" #include "margo_server.h" +#include "unifyfs_p2p_rpc.h" #include "unifyfs_server_rpcs.h" extern unifyfs_cfg_t server_cfg; +static int n_servers_reported; // = 0 static int* server_pids; // = NULL static pthread_cond_t server_pid_cond = PTHREAD_COND_INITIALIZER; static pthread_mutex_t server_pid_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -45,86 +47,6 @@ static int alloc_server_pids(void) return ret; } -static int server_pid_invoke_rpc(void) -{ - int ret = 0; - hg_return_t hret = 0; - hg_handle_t handle; - server_pid_in_t in; - server_pid_out_t out; - - in.rank = glb_pmi_rank; - in.pid = server_pid; - - hret = margo_create(unifyfsd_rpc_context->svr_mid, - glb_servers[0].margo_svr_addr, - unifyfsd_rpc_context->rpcs.server_pid_id, - &handle); - if (hret != HG_SUCCESS) { - LOGERR("failed to create rpc handle (ret=%d)", hret); - return UNIFYFS_ERROR_MARGO; - } - - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("failed to forward rpc (ret=%d)", hret); - return UNIFYFS_ERROR_MARGO; - } - - hret = margo_get_output(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("failed to get rpc result (ret=%d)", hret); - return UNIFYFS_ERROR_MARGO; - } - - ret = out.ret; - - margo_free_output(handle, &out); - margo_destroy(handle); - - return ret; -} - -static void server_pid_rpc(hg_handle_t handle) -{ - int ret = 0; - hg_return_t hret = 0; - server_pid_in_t in; - server_pid_out_t out; - - hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("failed to get input (ret=%d)", hret); - return; - } - - ret = alloc_server_pids(); - if (ret) { - LOGERR("failed to allocate pid array"); - return; - } - assert((int)in.rank < glb_pmi_size); - pthread_mutex_lock(&server_pid_mutex); - server_pids[in.rank] = (int) in.pid; - pthread_mutex_unlock(&server_pid_mutex); - - out.ret = 0; - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("failed to respond rpc (ret=%d)", hret); - return; - } - - margo_free_input(handle, &in); - margo_destroy(handle); - - ret = pthread_cond_signal(&server_pid_cond); - if (ret) { - LOGERR("failed to signal condition (%s)", strerror(ret)); - } -} -DEFINE_MARGO_RPC_HANDLER(server_pid_rpc); - static inline int set_pidfile_timeout(void) { int ret = 0; @@ -175,17 +97,36 @@ static int create_server_pid_file(void) return ret; } +int unifyfs_report_server_pid(int rank, int pid) +{ + assert(rank < glb_pmi_size); + + int ret = alloc_server_pids(); + if (ret) { + LOGERR("failed to allocate pid array"); + return ret; + } + + pthread_mutex_lock(&server_pid_mutex); + n_servers_reported++; + server_pids[rank] = pid; + pthread_cond_signal(&server_pid_cond); + pthread_mutex_unlock(&server_pid_mutex); + + return UNIFYFS_SUCCESS; +} + int unifyfs_publish_server_pids(void) { int ret = UNIFYFS_SUCCESS; if (glb_pmi_rank > 0) { /* publish my pid to server 0 */ - ret = server_pid_invoke_rpc(); + ret = unifyfs_invoke_server_pid_rpc(); if (ret) { LOGERR("failed to invoke pid rpc (%s)", strerror(ret)); } - } else { + } else { /* rank 0 acts as coordinator */ ret = alloc_server_pids(); if (ret) { return ret; @@ 
-198,35 +139,33 @@ int unifyfs_publish_server_pids(void) pthread_mutex_lock(&server_pid_mutex); server_pids[0] = server_pid; + n_servers_reported++; /* keep checking count of reported servers until all have reported * or we hit the timeout */ - do { - int count = 0; - for (int i = 0; i < glb_pmi_size; i++) { - if (server_pids[i] > 0) { - count++; - } - } - if (count == glb_pmi_size) { - ret = create_server_pid_file(); - if (UNIFYFS_SUCCESS == ret) { - LOGDBG("servers ready to accept client connections"); - } - break; - } + while (n_servers_reported < glb_pmi_size) { ret = pthread_cond_timedwait(&server_pid_cond, &server_pid_mutex, &server_pid_timeout); if (ETIMEDOUT == ret) { - LOGERR("some servers failed to initialize within timeout"); + LOGERR("server initialization timeout"); break; } else if (ret) { LOGERR("failed to wait on condition (err=%d, %s)", errno, strerror(errno)); break; } - } while (1); + } + + if (n_servers_reported == glb_pmi_size) { + ret = create_server_pid_file(); + if (UNIFYFS_SUCCESS == ret) { + LOGDBG("servers ready to accept client connections"); + } + } else { + LOGERR("%d of %d servers reported their pids", + n_servers_reported, glb_pmi_size); + } free(server_pids); server_pids = NULL; diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index 557e14a2c..53961c308 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -28,6 +28,8 @@ */ #include "unifyfs_global.h" +#include "unifyfs_group_rpc.h" +#include "unifyfs_p2p_rpc.h" #include "unifyfs_request_manager.h" #include "unifyfs_service_manager.h" #include "unifyfs_server_rpcs.h" @@ -37,38 +39,166 @@ typedef struct { /* the SM thread */ pthread_t thrd; + pid_t tid; - /* argobots mutex for synchronizing access to request state between - * margo rpc handler ULTs and SM thread */ - ABT_mutex sync; + /* pthread mutex and condition variable for work notification */ + pthread_mutex_t thrd_lock; + pthread_cond_t thrd_cond; /* thread status */ int initialized; + int waiting_for_work; volatile int time_to_exit; /* thread return status code */ int sm_exit_rc; + /* argobots mutex for synchronizing access to request state between + * margo rpc handler ULTs and SM thread */ + ABT_mutex reqs_sync; + /* list of chunk read requests from remote servers */ arraylist_t* chunk_reads; + /* list of service requests (server_rpc_req_t*) */ + arraylist_t* svc_reqs; + } svcmgr_state_t; svcmgr_state_t* sm; // = NULL -/* lock macro for debugging SM locking */ #define SM_LOCK() \ do { \ - LOGDBG("locking service manager state"); \ - ABT_mutex_lock(sm->sync); \ + if ((NULL != sm) && sm->initialized) { \ + /*LOGDBG("locking SM state");*/ \ + pthread_mutex_lock(&(sm->thrd_lock)); \ + } \ } while (0) -/* unlock macro for debugging SM locking */ #define SM_UNLOCK() \ do { \ - LOGDBG("unlocking service manager state"); \ - ABT_mutex_unlock(sm->sync); \ + if ((NULL != sm) && sm->initialized) { \ + /*LOGDBG("unlocking SM state");*/ \ + pthread_mutex_unlock(&(sm->thrd_lock)); \ + } \ } while (0) +#define SM_REQ_LOCK() \ +do { \ + if ((NULL != sm) && sm->initialized) { \ + /*LOGDBG("locking SM requests");*/ \ + ABT_mutex_lock(sm->reqs_sync); \ + } \ +} while (0) + +#define SM_REQ_UNLOCK() \ +do { \ + if ((NULL != sm) && sm->initialized) { \ + /*LOGDBG("unlocking SM requests");*/ \ + ABT_mutex_unlock(sm->reqs_sync); \ + } \ +} while (0) + +/* initialize and launch service manager thread */ +int svcmgr_init(void) +{ + /* allocate a service manager struct, + * store in global 
variable */ + sm = (svcmgr_state_t*)calloc(1, sizeof(svcmgr_state_t)); + if (NULL == sm) { + LOGERR("failed to allocate service manager state!"); + return ENOMEM; + } + + /* initialize lock for shared data structures of the + * request manager */ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + int rc = pthread_mutex_init(&(sm->thrd_lock), &attr); + if (rc != 0) { + LOGERR("pthread_mutex_init failed for service manager rc=%d (%s)", + rc, strerror(rc)); + svcmgr_fini(); + return rc; + } + + /* initialize condition variable to synchronize work + * notifications for the request manager thread */ + rc = pthread_cond_init(&(sm->thrd_cond), NULL); + if (rc != 0) { + LOGERR("pthread_cond_init failed for service manager rc=%d (%s)", + rc, strerror(rc)); + pthread_mutex_destroy(&(sm->thrd_lock)); + svcmgr_fini(); + return rc; + } + + ABT_mutex_create(&(sm->reqs_sync)); + + /* allocate a list to track chunk reads */ + sm->chunk_reads = arraylist_create(0); + if (sm->chunk_reads == NULL) { + LOGERR("failed to allocate service manager chunk_reads!"); + svcmgr_fini(); + return ENOMEM; + } + + /* allocate a list to track service requests */ + sm->svc_reqs = arraylist_create(0); + if (sm->svc_reqs == NULL) { + LOGERR("failed to allocate service manager svc_reqs!"); + svcmgr_fini(); + return ENOMEM; + } + + sm->tid = -1; + sm->initialized = 1; + + rc = pthread_create(&(sm->thrd), NULL, service_manager_thread, (void*)sm); + if (rc != 0) { + LOGERR("failed to create service manager thread"); + svcmgr_fini(); + return UNIFYFS_ERROR_THRDINIT; + } + + return UNIFYFS_SUCCESS; +} + +/* join service manager thread (if created) and clean up state */ +int svcmgr_fini(void) +{ + if (NULL != sm) { + if (sm->initialized) { + /* join thread before cleaning up state */ + if (sm->tid != -1) { + pthread_mutex_lock(&(sm->thrd_lock)); + sm->time_to_exit = 1; + pthread_cond_signal(&(sm->thrd_cond)); + pthread_mutex_unlock(&(sm->thrd_lock)); + pthread_join(sm->thrd, NULL); + } + } + + if (NULL != sm->chunk_reads) { + arraylist_free(sm->chunk_reads); + } + + if (NULL != sm->svc_reqs) { + arraylist_free(sm->svc_reqs); + } + + if (sm->initialized) { + pthread_mutex_destroy(&(sm->thrd_lock)); + pthread_cond_destroy(&(sm->thrd_cond)); + } + + /* free the service manager struct allocated during init */ + free(sm); + sm = NULL; + } + return UNIFYFS_SUCCESS; +} + /* Decode and issue chunk-reads received from request manager. * We get a list of read requests for data on our node. 
Read * data for each request and construct a set of read replies @@ -87,24 +217,11 @@ int sm_issue_chunk_reads(int src_rank, int src_client_id, int src_req_id, int num_chks, + size_t total_data_sz, char* msg_buf) { - /* get pointer to start of receive buffer */ - char* ptr = msg_buf; - - /* advance past command */ - ptr += sizeof(int); - - /* extract number of chunk read requests */ - assert(num_chks == *((int*)ptr)); - ptr += sizeof(int); - - /* total data size we'll be reading */ - size_t total_data_sz = *((size_t*)ptr); - ptr += sizeof(size_t); - /* get pointer to read request array */ - chunk_read_req_t* reqs = (chunk_read_req_t*)ptr; + chunk_read_req_t* reqs = (chunk_read_req_t*)msg_buf; /* we'll allocate a buffer to hold a list of chunk read response * structures, one for each chunk, followed by a data buffer @@ -118,7 +235,7 @@ int sm_issue_chunk_reads(int src_rank, // NOTE: calloc() is required here, don't use malloc char* crbuf = (char*) calloc(1, buf_sz); if (NULL == crbuf) { - LOGERR("failed to allocate chunk_read_reqs"); + LOGERR("failed to allocate chunk_read_reqs (buf_sz=%zu)", buf_sz); return ENOMEM; } @@ -158,6 +275,7 @@ int sm_issue_chunk_reads(int src_rank, for (i = 0; i < num_chks; i++) { /* pointer to next read request */ chunk_read_req_t* rreq = reqs + i; + debug_print_chunk_read_req(rreq); /* pointer to next read response */ chunk_read_resp_t* rresp = resp + i; @@ -171,8 +289,6 @@ int sm_issue_chunk_reads(int src_rank, rresp->read_rc = 0; rresp->nbytes = nbytes; rresp->offset = rreq->offset; - LOGDBG("reading chunk(offset=%zu, size=%zu)", - rreq->offset, nbytes); /* get pointer to next position in buffer to store read data */ char* buf_ptr = databuf + buf_cursor; @@ -193,9 +309,13 @@ int sm_issue_chunk_reads(int src_rank, rresp->read_rc = (ssize_t)(-rc); } } else { + LOGERR("app client [%d:%d] has NULL logio context", + app_id, cli_id); rresp->read_rc = (ssize_t)(-EINVAL); } } else { + LOGERR("failed to get application client [%d:%d] state", + app_id, cli_id); rresp->read_rc = (ssize_t)(-EINVAL); } @@ -205,14 +325,13 @@ int sm_issue_chunk_reads(int src_rank, if (src_rank != glb_pmi_rank) { /* we need to send these read responses to another rank, - * add chunk_reads to svcmgr response list and another - * thread will take care of that */ + * add chunk_reads to svcmgr response list */ LOGDBG("adding to svcmgr chunk_reads"); assert(NULL != sm); - SM_LOCK(); + SM_REQ_LOCK(); arraylist_add(sm->chunk_reads, scr); - SM_UNLOCK(); + SM_REQ_UNLOCK(); /* scr will be freed later by the sending thread */ @@ -224,7 +343,7 @@ int sm_issue_chunk_reads(int src_rank, int rc = rm_post_chunk_read_responses(src_app_id, src_client_id, src_rank, src_req_id, num_chks, buf_sz, crbuf); - if (rc != (int)UNIFYFS_SUCCESS) { + if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to handle chunk read responses"); } @@ -235,77 +354,150 @@ int sm_issue_chunk_reads(int src_rank, } } -/* initialize and launch service manager thread */ -int svcmgr_init(void) +int sm_laminate(int gfid) { - /* allocate a service manager struct, - * store in global variable */ - sm = (svcmgr_state_t*)calloc(1, sizeof(svcmgr_state_t)); - if (NULL == sm) { - LOGERR("failed to allocate service manager state!"); - return ENOMEM; + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + int ret = unifyfs_inode_laminate(gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to laminate gfid=%d (rc=%d, is_owner=%d)", + gfid, ret, is_owner); + } else if (is_owner) { + /* I'm the owner, tell the rest 
of the servers */ + ret = unifyfs_invoke_broadcast_laminate(gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("laminate broadcast failed"); + } } + return ret; +} - /* allocate a list to track chunk read requests */ - sm->chunk_reads = arraylist_create(0); - if (sm->chunk_reads == NULL) { - LOGERR("failed to allocate service manager chunk_reads!"); - svcmgr_fini(); - return ENOMEM; +int sm_get_fileattr(int gfid, + unifyfs_file_attr_t* attrs) +{ + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + /* do local inode metadata lookup */ + int ret = unifyfs_inode_metaget(gfid, attrs); + if (ret) { + if (ret != ENOENT) { + LOGERR("failed to get attributes for gfid=%d (rc=%d, is_owner=%d)", + gfid, ret, is_owner); + } } + return ret; +} - ABT_mutex_create(&(sm->sync)); - - sm->initialized = 1; - - int rc = pthread_create(&(sm->thrd), NULL, - service_manager_thread, (void*)sm); - if (rc != 0) { - LOGERR("failed to create service manager thread"); - svcmgr_fini(); - return (int)UNIFYFS_ERROR_THRDINIT; +int sm_set_fileattr(int gfid, + int file_op, + unifyfs_file_attr_t* attrs) +{ + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + /* set local metadata for target file */ + int ret = unifyfs_inode_metaset(gfid, file_op, attrs); + if (ret) { + if ((ret == EEXIST) && (file_op == UNIFYFS_FILE_ATTR_OP_CREATE)) { + LOGWARN("create requested for existing gfid=%d", gfid); + } else { + LOGERR("failed to set attributes for gfid=%d (rc=%d, is_owner=%d)", + gfid, ret, is_owner); + } } - - return (int)UNIFYFS_SUCCESS; + return ret; } -/* join service manager thread (if created) and clean up state */ -int svcmgr_fini(void) +int sm_add_extents(int gfid, + size_t num_extents, + struct extent_tree_node* extents) { - if (NULL != sm) { - if (sm->thrd) { - sm->time_to_exit = 1; - pthread_join(sm->thrd, NULL); - } + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + unsigned int n_extents = (unsigned int)num_extents; + int ret = unifyfs_inode_add_extents(gfid, n_extents, extents); + if (ret) { + LOGERR("failed to add %u extents to gfid=%d (rc=%d, is_owner=%d)", + n_extents, gfid, ret, is_owner); + } + return ret; +} - if (sm->initialized) { - SM_LOCK(); +int sm_find_extents(int gfid, + size_t num_extents, + unifyfs_inode_extent_t* extents, + unsigned int* out_num_chunks, + chunk_read_req_t** out_chunks) +{ + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + /* do local inode metadata lookup to check for laminated */ + unifyfs_file_attr_t attrs; + int ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret == UNIFYFS_SUCCESS) { + /* do local lookup */ + if (is_owner || attrs.is_laminated) { + unsigned int n_extents = (unsigned int)num_extents; + ret = unifyfs_inode_resolve_extent_chunks(n_extents, extents, + out_num_chunks, + out_chunks); + if (ret) { + LOGERR("failed to find extents for gfid=%d (rc=%d)", + gfid, ret); + } else if (*out_num_chunks == 0) { + LOGDBG("extent lookup found no matching chunks"); + } + } else { + LOGWARN("cannot find extents for unlaminated file at non-owner"); + ret = UNIFYFS_FAILURE; } + } + return ret; +} - arraylist_free(sm->chunk_reads); - - if (sm->initialized) { - SM_UNLOCK(); +int sm_truncate(int gfid, size_t filesize) +{ + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + unifyfs_file_attr_t attrs; + int ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret == 
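For reference, the response buffer that sm_issue_chunk_reads() assembles above is laid out as an array of chunk_read_resp_t headers, one per chunk, followed by the packed read data for those chunks. A hedged sketch of how a consumer could walk such a buffer (assuming each chunk's bytes appear consecutively in the data region in request order, which is how the packing loop above appears to fill it) is:

    /* Sketch: walk a chunk-read response buffer laid out as
     * [num_chks x chunk_read_resp_t][packed read data], as assembled by
     * sm_issue_chunk_reads().  Assumes each chunk's bytes appear in the
     * data region consecutively, in request order. */
    static void example_walk_chunk_responses(char* crbuf, int num_chks)
    {
        chunk_read_resp_t* resp = (chunk_read_resp_t*) crbuf;
        char* data = crbuf + ((size_t)num_chks * sizeof(chunk_read_resp_t));

        size_t cursor = 0;
        for (int i = 0; i < num_chks; i++) {
            chunk_read_resp_t* r = resp + i;
            char* chunk_data = data + cursor;      /* bytes for this chunk */
            if (r->read_rc >= 0) {
                /* hand r->nbytes bytes at chunk_data back to the requester */
            }
            (void) chunk_data;
            cursor += r->nbytes;
        }
    }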
UNIFYFS_SUCCESS) { + /* apply truncation to local file state */ + size_t old_size = (size_t) attrs.size; + LOGDBG("truncate - gfid=%d size=%zu old-size=%zu", + gfid, filesize, old_size); + int ret = unifyfs_inode_truncate(gfid, (unsigned long)filesize); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("truncate(gfid=%d, size=%zu) failed", + gfid, filesize); + } else if (is_owner && (filesize < old_size)) { + /* truncate the target file at other servers */ + ret = unifyfs_invoke_broadcast_truncate(gfid, filesize); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("truncate broadcast failed"); + } } - - /* free the service manager struct allocated during init */ - free(sm); - sm = NULL; } - return (int)UNIFYFS_SUCCESS; + return ret; } + /* iterate over list of chunk reads and send responses */ static int send_chunk_read_responses(void) { /* assume we'll succeed */ - int rc = (int)UNIFYFS_SUCCESS; + int rc = UNIFYFS_SUCCESS; /* this will hold a list of chunk read requests if we find any */ arraylist_t* chunk_reads = NULL; /* lock to access global service manager object */ - ABT_mutex_lock(sm->sync); + SM_REQ_LOCK(); /* if we have any chunk reads, take pointer to the list * of chunk read requests and replace it with a newly allocated @@ -320,7 +512,7 @@ static int send_chunk_read_responses(void) } /* release lock on service manager object */ - ABT_mutex_unlock(sm->sync); + SM_REQ_UNLOCK(); /* iterate over each chunk read request */ for (int i = 0; i < num_chunk_reads; i++) { @@ -339,216 +531,673 @@ static int send_chunk_read_responses(void) return rc; } -/* Entry point for service manager thread. The SM thread - * runs in a loop processing read request replies until - * the main server thread asks it to exit. The read requests - * themselves are handled by Margo RPC threads. 
- * - * @param arg: pointer to SM thread control structure - * @return NULL */ -void* service_manager_thread(void* arg) +static inline void signal_new_requests(void) { - int rc; + pid_t this_thread = unifyfs_gettid(); + if (this_thread != sm->tid) { + /* signal svcmgr to begin processing the requests we just added */ + LOGDBG("signaling new service requests"); + pthread_cond_signal(&(sm->thrd_cond)); + } +} - LOGDBG("I am the service manager thread!"); - assert(sm == (svcmgr_state_t*)arg); +/* submit a request to the service manager thread */ +int sm_submit_service_request(server_rpc_req_t* req) +{ + if ((NULL == sm) || (NULL == sm->svc_reqs)) { + return UNIFYFS_FAILURE; + } - /* handle chunk reads until signaled to exit */ - while (1) { - rc = send_chunk_read_responses(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to send chunk read responses"); - } + SM_REQ_LOCK(); + arraylist_add(sm->svc_reqs, req); + SM_REQ_UNLOCK(); - if (sm->time_to_exit) { - break; + signal_new_requests(); + + return UNIFYFS_SUCCESS; +} + +static int process_chunk_read_rpc(server_rpc_req_t* req) +{ + int ret; + chunk_read_request_in_t* in = req->input; + + /* issue chunk read requests */ + int src_rank = (int)in->src_rank; + int app_id = (int)in->app_id; + int client_id = (int)in->client_id; + int req_id = (int)in->req_id; + int num_chks = (int)in->num_chks; + size_t total_sz = (size_t)in->total_data_size; + + LOGDBG("handling chunk read requests from server[%d]: " + "req=%d num_chunks=%d data_sz=%zu bulk_sz=%zu", + src_rank, req_id, num_chks, total_sz, req->bulk_sz); + + ret = sm_issue_chunk_reads(src_rank, app_id, client_id, + req_id, num_chks, total_sz, + (char*)req->bulk_buf); + + margo_free_input(req->handle, in); + free(in); + free(req->bulk_buf); + + /* send rpc response */ + chunk_read_request_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_add_extents_rpc(server_rpc_req_t* req) +{ + /* get input parameters */ + add_extents_in_t* in = req->input; + int sender = (int) in->src_rank; + int gfid = (int) in->gfid; + size_t num_extents = (size_t) in->num_extents; + struct extent_tree_node* extents = req->bulk_buf; + + /* add extents */ + LOGDBG("adding %zu extents to gfid=%d from server[%d]", + num_extents, gfid, sender); + int ret = sm_add_extents(gfid, num_extents, extents); + if (ret) { + LOGERR("failed to add extents from %d (ret=%d)", sender, ret); + } + + margo_free_input(req->handle, in); + free(in); + free(req->bulk_buf); + + /* send rpc response */ + add_extents_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + +static int process_find_extents_rpc(server_rpc_req_t* req) +{ + /* get input parameters */ + find_extents_in_t* in = req->input; + int sender = (int) in->src_rank; + int gfid = (int) in->gfid; + size_t num_extents = (size_t) in->num_extents; + unifyfs_inode_extent_t* extents = req->bulk_buf; + + LOGDBG("received %zu extent lookups for gfid=%d from server[%d]", + num_extents, gfid, sender); + + /* find chunks for given extents */ + unsigned int num_chunks = 0; + chunk_read_req_t* chunk_locs = NULL; + int ret = sm_find_extents(gfid, num_extents, extents, + &num_chunks, &chunk_locs); + + 
margo_free_input(req->handle, in); + free(in); + free(req->bulk_buf); + + /* define a bulk handle to transfer chunk address info */ + hg_bulk_t bulk_resp_handle = HG_BULK_NULL; + if (ret == UNIFYFS_SUCCESS) { + if (num_chunks > 0) { + const struct hg_info* hgi = margo_get_info(req->handle); + assert(hgi); + margo_instance_id mid = margo_hg_info_get_instance(hgi); + assert(mid != MARGO_INSTANCE_NULL); + + void* buf = (void*) chunk_locs; + size_t buf_sz = (size_t)num_chunks * sizeof(chunk_read_req_t); + hg_return_t hret = margo_bulk_create(mid, 1, &buf, &buf_sz, + HG_BULK_READ_ONLY, + &bulk_resp_handle); + if (hret != HG_SUCCESS) { + LOGERR("margo_bulk_create() failed"); + ret = UNIFYFS_ERROR_MARGO; + } } + } + + /* send rpc response */ + find_extents_out_t out; + out.ret = (int32_t) ret; + out.num_locations = (int32_t) num_chunks; + out.locations = bulk_resp_handle; - /* wait an interval */ - usleep(MIN_USLEEP_INTERVAL); + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); } - LOGDBG("service manager thread exiting"); + if (HG_BULK_NULL != bulk_resp_handle) { + margo_bulk_free(bulk_resp_handle); + } - sm->sm_exit_rc = (int)UNIFYFS_SUCCESS; - return NULL; + /* cleanup req */ + margo_destroy(req->handle); + + return ret; } -/* BEGIN MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ +static int process_filesize_rpc(server_rpc_req_t* req) +{ + /* get target file */ + filesize_in_t* in = req->input; + int gfid = (int) in->gfid; + margo_free_input(req->handle, in); + free(in); + + /* get size of target file */ + size_t filesize; + int ret = unifyfs_inode_get_filesize(gfid, &filesize); + + /* send rpc response */ + filesize_out_t out; + out.ret = (int32_t) ret; + out.filesize = (hg_size_t) filesize; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); -/* invokes the chunk_read_response rpc, this sends a set of read - * reply headers and corresponding data back to a server that - * had requested we read data on its behalf, the headers and - * data are posted as a bulk transfer buffer */ -int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr) + return ret; +} + +static int process_laminate_rpc(server_rpc_req_t* req) { - /* assume we'll succeed */ - int rc = UNIFYFS_SUCCESS; + /* get target file */ + laminate_in_t* in = req->input; + int gfid = (int)in->gfid; + margo_free_input(req->handle, in); + free(in); + + /* do file lamination */ + int ret = sm_laminate(gfid); + + /* send rpc response */ + laminate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } - /* rank of destination server */ - int dst_rank = scr->rank; - assert(dst_rank < (int)glb_num_servers); + /* cleanup req */ + margo_destroy(req->handle); - /* get address of destinaton server */ - hg_addr_t dst_addr = glb_servers[dst_rank].margo_svr_addr; + return ret; +} - /* pointer to struct containing rpc context info, - * shorter name for convience */ - ServerRpcContext_t* ctx = unifyfsd_rpc_context; +static int process_metaget_rpc(server_rpc_req_t* req) +{ + /* get target file */ + metaget_in_t* in = req->input; + int gfid = (int) in->gfid; + margo_free_input(req->handle, in); + free(in); + + /* initialize invalid attributes */ + unifyfs_file_attr_t attrs; + unifyfs_file_attr_set_invalid(&attrs); + + /* get metadata for target file */ + int ret 
= sm_get_fileattr(gfid, &attrs); + + /* send rpc response */ + metaget_out_t out; + out.ret = (int32_t) ret; + out.attr = attrs; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); - /* get handle to read response rpc on destination server */ - hg_handle_t handle; - hg_id_t resp_id = ctx->rpcs.chunk_read_response_id; - hg_return_t hret = margo_create(ctx->svr_mid, dst_addr, - resp_id, &handle); + return ret; + return UNIFYFS_ERROR_NYI; +} + +static int process_metaset_rpc(server_rpc_req_t* req) +{ + /* update target file metadata */ + metaset_in_t* in = req->input; + int gfid = (int) in->gfid; + int attr_op = (int) in->fileop; + unifyfs_file_attr_t* attrs = &(in->attr); + int ret = sm_set_fileattr(gfid, attr_op, attrs); + margo_free_input(req->handle, in); + free(in); + + /* send rpc response */ + metaset_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_create() failed"); - return UNIFYFS_ERROR_MARGO; + LOGERR("margo_respond() failed"); } - /* get address and size of our response buffer */ - void* data_buf = (void*)scr->resp; - hg_size_t bulk_sz = scr->total_sz; + /* cleanup req */ + margo_destroy(req->handle); - /* register our response buffer for bulk remote read access */ - chunk_read_response_in_t in; - hret = margo_bulk_create(ctx->svr_mid, 1, &data_buf, &bulk_sz, - HG_BULK_READ_ONLY, &in.bulk_handle); + return ret; +} + +static int process_server_pid_rpc(server_rpc_req_t* req) +{ + /* get input parameters */ + server_pid_in_t* in = req->input; + int src_rank = (int) in->rank; + int pid = (int) in->pid; + margo_free_input(req->handle, in); + free(in); + + /* do pid report */ + int ret = unifyfs_report_server_pid(src_rank, pid); + + /* send rpc response */ + server_pid_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - return UNIFYFS_ERROR_MARGO; + LOGERR("margo_respond() failed"); } - /* fill in input struct */ - in.src_rank = (int32_t)glb_pmi_rank; - in.app_id = (int32_t)scr->app_id; - in.client_id = (int32_t)scr->client_id; - in.req_id = (int32_t)scr->rdreq_id; - in.num_chks = (int32_t)scr->num_chunks; - in.bulk_size = bulk_sz; + /* cleanup req */ + margo_destroy(req->handle); - /* call the read response rpc */ - LOGDBG("invoking the chunk-read-response rpc function"); - hret = margo_forward(handle, &in); + return ret; +} + +static int process_truncate_rpc(server_rpc_req_t* req) +{ + /* get target file and requested file size */ + truncate_in_t* in = req->input; + int gfid = (int) in->gfid; + size_t fsize = (size_t) in->filesize; + margo_free_input(req->handle, in); + free(in); + + /* do file truncation */ + int ret = sm_truncate(gfid, fsize); + + /* send rpc response */ + truncate_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); - rc = UNIFYFS_ERROR_MARGO; - } else { - /* rpc executed, now decode response */ - chunk_read_response_out_t out; - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - rc = (int)out.ret; - LOGDBG("chunk-read-response rpc to %d - ret=%d", - dst_rank, rc); - margo_free_output(handle, &out); - } else { - LOGERR("margo_get_output() failed"); - rc = UNIFYFS_ERROR_MARGO; + LOGERR("margo_respond() failed"); + } + + /* cleanup req 
*/ + margo_destroy(req->handle); + + return ret; +} + +static int process_extents_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and extents */ + extent_bcast_in_t* in = req->input; + int gfid = (int) in->gfid; + size_t num_extents = (size_t) in->num_extents; + struct extent_tree_node* extents = req->bulk_buf; + + LOGDBG("gfid=%d num_extents=%zu", gfid, num_extents); + + /* add extents */ + int ret = sm_add_extents(gfid, num_extents, extents); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("add_extents(gfid=%d) failed - rc=%d", gfid, ret); + } + collective_set_local_retval(req->coll, ret); + + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); + + return ret; +} + +static int process_fileattr_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and attributes */ + fileattr_bcast_in_t* in = req->input; + int gfid = (int) in->gfid; + int attr_op = (int) in->attrop; + unifyfs_file_attr_t* attrs = &(in->attr); + + LOGDBG("gfid=%d", gfid); + + /* update file attributes */ + int ret = sm_set_fileattr(gfid, attr_op, attrs); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("set_fileattr(gfid=%d, op=%d) failed - rc=%d", + gfid, attr_op, ret); + } + collective_set_local_retval(req->coll, ret); + + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); + + return ret; +} + +static int process_laminate_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and extents */ + laminate_bcast_in_t* in = req->input; + int gfid = (int) in->gfid; + size_t num_extents = (size_t) in->num_extents; + unifyfs_file_attr_t* fattr = &(in->attr); + struct extent_tree_node* extents = req->bulk_buf; + + LOGDBG("gfid=%d num_extents=%zu", gfid, num_extents); + + /* update inode file attributes. first check to make sure + * inode for the gfid exists. if it doesn't, create it with + * given attrs. otherwise, just do a metadata update. 
*/ + unifyfs_file_attr_t existing_fattr; + int ret = unifyfs_inode_metaget(gfid, &existing_fattr); + if (ret == ENOENT) { + /* create with is_laminated=0 so we can add extents */ + fattr->is_laminated = 0; + ret = unifyfs_inode_create(gfid, fattr); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("inode create during laminate(gfid=%d) failed - rc=%d", + gfid, ret); } + fattr->is_laminated = 1; } - /* free resources allocated for executing margo rpc */ - margo_bulk_free(in.bulk_handle); - margo_destroy(handle); + /* add extents */ + ret = sm_add_extents(gfid, num_extents, extents); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("extent add during laminate(gfid=%d) failed - rc=%d", + gfid, ret); + collective_set_local_retval(req->coll, ret); + } - /* free response data buffer */ - free(data_buf); - scr->resp = NULL; + /* mark as laminated with passed attributes */ + int attr_op = UNIFYFS_FILE_ATTR_OP_LAMINATE; + ret = sm_set_fileattr(gfid, attr_op, fattr); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("metaset during laminate(gfid=%d) failed - rc=%d", + gfid, ret); + collective_set_local_retval(req->coll, ret); + } - return rc; + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); + + return ret; } -/* BEGIN MARGO SERVER-SERVER RPC HANDLERS */ +static int process_truncate_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and requested file size */ + truncate_bcast_in_t* in = req->input; + int gfid = (int) in->gfid; + size_t fsize = (size_t) in->filesize; + + LOGDBG("gfid=%d size=%zu", gfid, fsize); + + /* apply truncation to local file state */ + int ret = unifyfs_inode_truncate(gfid, (unsigned long)fsize); + if (ret != UNIFYFS_SUCCESS) { + /* owner is root of broadcast tree */ + int is_owner = ((int)(in->root) == glb_pmi_rank); + if ((ret == ENOENT) && !is_owner) { + /* it's ok if inode doesn't exist at non-owners */ + ret = UNIFYFS_SUCCESS; + } else { + LOGERR("truncate(gfid=%d, size=%zu) failed - rc=%d", + gfid, fsize, ret); + } + } + collective_set_local_retval(req->coll, ret); + + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); + + return ret; +} + +static int process_unlink_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and requested file size */ + unlink_bcast_in_t* in = req->input; + int gfid = (int) in->gfid; + + LOGDBG("gfid=%d", gfid); + + /* apply truncation to local file state */ + int ret = unifyfs_inode_unlink(gfid); + if (ret != UNIFYFS_SUCCESS) { + /* owner is root of broadcast tree */ + int is_owner = ((int)(in->root) == glb_pmi_rank); + if ((ret == ENOENT) && !is_owner) { + /* it's ok if inode doesn't exist at non-owners */ + ret = UNIFYFS_SUCCESS; + } else { + LOGERR("unlink(gfid=%d) failed - rc=%d", gfid, ret); + } + } + collective_set_local_retval(req->coll, ret); + + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); -/* handler for server-server chunk read request */ -static void chunk_read_request_rpc(hg_handle_t handle) + return ret; +} + +static int process_service_requests(void) { - int32_t ret = UNIFYFS_SUCCESS; + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; - /* get input params */ - chunk_read_request_in_t in; - hg_return_t hret = margo_get_input(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* extract params from input struct */ - int src_rank = (int)in.src_rank; - int app_id = (int)in.app_id; - int client_id = (int)in.client_id; - int req_id = 
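The *_bcast_rpc service routines above all share one shape: apply the operation to local state, record this server's result in the collective with collective_set_local_retval(), then hand the collective back to a progress ULT via invoke_bcast_progress_rpc(). A condensed sketch of that skeleton for a hypothetical broadcast operation follows; the input type and do_local_update() helper are placeholders, not part of this patch.

    /* Sketch of the common broadcast-handler skeleton.  example_bcast_in_t
     * and do_local_update() are hypothetical stand-ins for the per-operation
     * input struct and local work (truncate, unlink, metaset, ...). */
    static int process_example_bcast_rpc(server_rpc_req_t* req)
    {
        example_bcast_in_t* in = req->input;
        int gfid = (int) in->gfid;

        /* 1. apply the update to local file state */
        int ret = do_local_update(gfid, in);
        if (ret != UNIFYFS_SUCCESS) {
            /* non-owners may legitimately lack the inode; the owner is the
             * root of the broadcast tree for inputs that carry a root rank */
            int is_owner = ((int)(in->root) == glb_pmi_rank);
            if ((ret == ENOENT) && !is_owner) {
                ret = UNIFYFS_SUCCESS;
            } else {
                LOGERR("local update failed for gfid=%d - rc=%d", gfid, ret);
            }
        }

        /* 2. record this server's result in the collective state */
        collective_set_local_retval(req->coll, ret);

        /* 3. create a ULT to continue the broadcast and send the response */
        return invoke_bcast_progress_rpc(req->coll);
    }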
(int)in.req_id; - int num_chks = (int)in.num_chks; - size_t bulk_sz = (size_t)in.bulk_size; - - LOGDBG("handling chunk read request from server %d: " - "req=%d num_chunks=%d bulk_sz=%zu", - src_rank, req_id, num_chks, bulk_sz); - - /* get margo info */ - const struct hg_info* hgi = margo_get_info(handle); - assert(NULL != hgi); - - margo_instance_id mid = margo_hg_info_get_instance(hgi); - assert(mid != MARGO_INSTANCE_NULL); - - hg_bulk_t bulk_handle; - int reqcmd = (int)SVC_CMD_INVALID; - void* reqbuf = NULL; - if (bulk_sz) { - /* allocate and register local target buffer for bulk access */ - reqbuf = malloc(bulk_sz); - if (NULL != reqbuf) { - hret = margo_bulk_create(mid, 1, &reqbuf, &in.bulk_size, - HG_BULK_WRITE_ONLY, &bulk_handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* pull request data */ - hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, - in.bulk_handle, 0, - bulk_handle, 0, in.bulk_size); - if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_transfer() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* first int in request buffer is the command */ - reqcmd = *(int*)reqbuf; - - /* verify this is a request for data */ - if (reqcmd == (int)SVC_CMD_RDREQ_CHK) { - /* chunk read request command */ - LOGDBG("request command: SVC_CMD_RDREQ_CHK"); - ret = sm_issue_chunk_reads(src_rank, - app_id, client_id, - req_id, num_chks, - (char*)reqbuf); - } else { - LOGERR("invalid command %d from server %d", - reqcmd, src_rank); - ret = EINVAL; - } - } - margo_bulk_free(bulk_handle); - } - free(reqbuf); - } else { - ret = ENOMEM; + /* this will hold a list of client requests if we find any */ + arraylist_t* svc_reqs = NULL; + + /* lock to access requests */ + SM_REQ_LOCK(); + + /* if we have any requests, take pointer to the list + * of requests and replace it with a newly allocated + * list on the request manager structure */ + int num_svc_reqs = arraylist_size(sm->svc_reqs); + if (num_svc_reqs) { + /* got some client requets, take the list and replace + * it with an empty list */ + LOGDBG("processing %d service requests", num_svc_reqs); + svc_reqs = sm->svc_reqs; + sm->svc_reqs = arraylist_create(0); + } + + /* release lock on sm requests */ + SM_REQ_UNLOCK(); + + /* iterate over each client request */ + for (int i = 0; i < num_svc_reqs; i++) { + /* process next request */ + int rret; + server_rpc_req_t* req = (server_rpc_req_t*) + arraylist_get(svc_reqs, i); + switch (req->req_type) { + case UNIFYFS_SERVER_RPC_CHUNK_READ: + rret = process_chunk_read_rpc(req); + break; + case UNIFYFS_SERVER_RPC_EXTENTS_ADD: + rret = process_add_extents_rpc(req); + break; + case UNIFYFS_SERVER_RPC_EXTENTS_FIND: + rret = process_find_extents_rpc(req); + break; + case UNIFYFS_SERVER_RPC_FILESIZE: + rret = process_filesize_rpc(req); + break; + case UNIFYFS_SERVER_RPC_LAMINATE: + rret = process_laminate_rpc(req); + break; + case UNIFYFS_SERVER_RPC_METAGET: + rret = process_metaget_rpc(req); + break; + case UNIFYFS_SERVER_RPC_METASET: + rret = process_metaset_rpc(req); + break; + case UNIFYFS_SERVER_RPC_PID_REPORT: + rret = process_server_pid_rpc(req); + break; + case UNIFYFS_SERVER_RPC_TRUNCATE: + rret = process_truncate_rpc(req); + break; + case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: + rret = process_extents_bcast_rpc(req); + break; + case UNIFYFS_SERVER_BCAST_RPC_FILEATTR: + rret = process_fileattr_bcast_rpc(req); + break; + case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: + rret = process_laminate_bcast_rpc(req); + break; + case 
UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: + rret = process_truncate_bcast_rpc(req); + break; + case UNIFYFS_SERVER_BCAST_RPC_UNLINK: + rret = process_unlink_bcast_rpc(req); + break; + default: + LOGERR("unsupported server rpc request type %d", req->req_type); + rret = UNIFYFS_ERROR_NYI; + break; + } + if (rret != UNIFYFS_SUCCESS) { + if ((rret != ENOENT) && (rret != EEXIST)) { + LOGERR("server rpc request %d failed (%s)", + i, unifyfs_rc_enum_description(rret)); } + ret = rret; } - margo_free_input(handle, &in); } - /* return output to caller */ - chunk_read_request_out_t out; - out.ret = ret; - hret = margo_respond(handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + /* free the list if we have one */ + if (NULL != svc_reqs) { + /* NOTE: this will call free() on each req in the arraylist */ + arraylist_free(svc_reqs); } - /* free margo resources */ - margo_destroy(handle); + return ret; +} + +/* Entry point for service manager thread. The SM thread + * runs in a loop processing read request replies until + * the main server thread asks it to exit. The read requests + * themselves are handled by Margo RPC threads. + * + * @param arg: pointer to SM thread control structure + * @return NULL */ +void* service_manager_thread(void* arg) +{ + int rc; + + sm->tid = unifyfs_gettid(); + LOGINFO("I am the service manager thread!"); + assert(sm == (svcmgr_state_t*)arg); + +#if defined(USE_SVCMGR_PROGRESS_TIMER) + int have_progress_timer = 0; + timer_t progress_timer; + struct itimerspec alarm_set = { {0}, {0} }; + struct itimerspec alarm_reset = { {0}, {0} }; + rc = timer_create(CLOCK_REALTIME, NULL, &progress_timer); + if (rc != 0) { + LOGERR("failed to create progress timer"); + } else { + have_progress_timer = 1; + alarm_set.it_value.tv_sec = 60; + } +#endif + + /* handle requests until told to exit */ + while (1) { + +#if defined(USE_SVCMGR_PROGRESS_TIMER) + if (have_progress_timer) { + /* set a progress alarm for one minute */ + rc = timer_settime(progress_timer, 0, &alarm_set, NULL); + } +#endif + + rc = process_service_requests(); + if (rc != UNIFYFS_SUCCESS) { + LOGWARN("failed to process service requests"); + } + + rc = send_chunk_read_responses(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to send chunk read responses"); + } + +#if defined(USE_SVCMGR_PROGRESS_TIMER) + if (have_progress_timer) { + /* cancel progress alarm */ + rc = timer_settime(progress_timer, 0, &alarm_reset, NULL); + } +#endif + + /* inform dispatcher that we're waiting for work + * inside the critical section */ + SM_LOCK(); + sm->waiting_for_work = 1; + + /* release lock and wait to be signaled by dispatcher */ + //LOGDBG("SM waiting for work"); + struct timespec timeout; + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_nsec += 50000000; /* 50 ms */ + if (timeout.tv_nsec >= 1000000000) { + timeout.tv_nsec -= 1000000000; + timeout.tv_sec++; + } + int wait_rc = pthread_cond_timedwait(&(sm->thrd_cond), + &(sm->thrd_lock), + &timeout); + if (0 == wait_rc) { + LOGDBG("SM got work"); + } else if (ETIMEDOUT != wait_rc) { + LOGERR("SM work condition wait failed (rc=%d)", wait_rc); + } + + /* set flag to indicate we're no longer waiting */ + sm->waiting_for_work = 0; + SM_UNLOCK(); + + if (sm->time_to_exit) { + break; + } + } + + LOGDBG("service manager thread exiting"); + + sm->sm_exit_rc = UNIFYFS_SUCCESS; + return NULL; } -DEFINE_MARGO_RPC_HANDLER(chunk_read_request_rpc) diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index a2a9c9f2a..05cc441fd 100644 
--- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -32,6 +32,15 @@ #include "unifyfs_global.h" +typedef struct { + server_rpc_e req_type; + hg_handle_t handle; + void* coll; + void* input; + void* bulk_buf; + size_t bulk_sz; +} server_rpc_req_t; + /* service manager pthread routine */ void* service_manager_thread(void* ctx); @@ -41,15 +50,46 @@ int svcmgr_init(void); /* join service manager thread and cleanup its state */ int svcmgr_fini(void); +/** + * @brief submit a server rpc request to the service manager thread. + * + * @param req pointer to server rpc request struct + * + * @return UNIFYFS_SUCCESS, or error code + */ +int sm_submit_service_request(server_rpc_req_t* req); + /* decode and issue chunk reads contained in message buffer */ int sm_issue_chunk_reads(int src_rank, int src_app_id, int src_client_id, int src_req_id, int num_chks, + size_t total_data_sz, char* msg_buf); -/* MARGO SERVER-SERVER RPC INVOCATION FUNCTIONS */ -int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr); +/* File service operations */ + +int sm_laminate(int gfid); + +int sm_get_fileattr(int gfid, + unifyfs_file_attr_t* attrs); + +int sm_set_fileattr(int gfid, + int file_op, + unifyfs_file_attr_t* attrs); + +int sm_add_extents(int gfid, + size_t num_extents, + struct extent_tree_node* extents); + +int sm_find_extents(int gfid, + size_t num_extents, + unifyfs_inode_extent_t* extents, + unsigned int* out_num_chunks, + chunk_read_req_t** out_chunks); + +int sm_truncate(int gfid, + size_t filesize); #endif // UNIFYFS_SERVICE_MANAGER_H diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index 6301c8890..0e6368fa1 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ b/util/unifyfs/src/unifyfs-rm.c @@ -1019,22 +1019,24 @@ static int srun_launch(unifyfs_resource_t* resource, size_t argc, srun_argc, server_argc; char** argv = NULL; char n_nodes[16]; + char n_cores[8]; - // full command: srun - - srun_argc = 5; - snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); + snprintf(n_cores, sizeof(n_cores), "-c%d", resource->n_cores_per_server); + snprintf(n_nodes, sizeof(n_nodes), "-N%zu", resource->n_nodes); + // full command: srun + srun_argc = 6; server_argc = construct_server_argv(args, NULL); // setup full command argv argc = 1 + srun_argc + server_argc; argv = calloc(argc, sizeof(char*)); argv[0] = strdup("srun"); - argv[1] = strdup("-N"); - argv[2] = strdup(n_nodes); - argv[3] = strdup("--ntasks-per-node"); - argv[4] = strdup("1"); + argv[1] = strdup("--exact"); + argv[2] = strdup("--overcommit"); + argv[3] = strdup(n_nodes); + argv[4] = strdup("--ntasks-per-node=1"); + argv[5] = strdup(n_cores); construct_server_argv(args, argv + srun_argc); execvp(argv[0], argv); @@ -1057,18 +1059,17 @@ static int srun_terminate(unifyfs_resource_t* resource, char** argv = NULL; char n_nodes[16]; + snprintf(n_nodes, sizeof(n_nodes), "-N%zu", resource->n_nodes); + // full command: srun pkill -n unifyfsd srun_argc = 8; - snprintf(n_nodes, sizeof(n_nodes), "%zu", resource->n_nodes); - - // setup full command argv argc = 1 + srun_argc; argv = calloc(argc, sizeof(char*)); argv[0] = strdup("srun"); - argv[1] = strdup("-N"); - argv[2] = strdup(n_nodes); - argv[3] = strdup("-n"); - argv[4] = strdup(n_nodes); + argv[1] = strdup("--exact"); + argv[2] = strdup("--overcommit"); + argv[3] = strdup(n_nodes); + argv[4] = strdup("--ntasks-per-node=1"); argv[5] = strdup("pkill"); argv[6] = strdup("-n"); argv[7] = strdup("unifyfsd"); @@ -1089,11 +1090,11 @@ static int 
srun_terminate(unifyfs_resource_t* resource, static int srun_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) { - size_t srun_argc = 5; + size_t srun_argc = 3; char cmd[200]; // full command: srun - snprintf(cmd, sizeof(cmd), "srun -N %zu --ntasks-per-node 1", + snprintf(cmd, sizeof(cmd), "srun -N%zu --ntasks-per-node=1", resource->n_nodes); generic_stage(cmd, srun_argc, args); From a2536812641279b239c4a26b3d241dd7dc4d2aa9 Mon Sep 17 00:00:00 2001 From: CamStan Date: Fri, 11 Jun 2021 18:54:14 -0700 Subject: [PATCH 13/81] Bugfix: unlink global file not found locally Moved shared logic for rmdir, unlink, and remove wrappers to a function that will remove a given target given considerations from the given file mode mask. Add logic to unlink/remove/rmdir a file/dir that exists globally when it is not found locally. Add option to reuse a file name, and a function to remove a file to testutil.h. Add check for reusing a file name to write, writeread, and cr examples. Change default `UNIFYFS_LOGIO_SPILL_SIZE` to 4GIB. Minor documentation clarifications. --- client/src/unifyfs-sysio.c | 172 +++++++++++++++++++----------- common/src/unifyfs_const.h | 2 +- docs/build.rst | 10 +- docs/configuration.rst | 2 +- docs/testing.rst | 6 +- examples/src/checkpoint-restart.c | 15 ++- examples/src/testutil.h | 61 ++++++++++- examples/src/write.c | 17 ++- examples/src/writeread.c | 13 ++- 9 files changed, 215 insertions(+), 83 deletions(-) diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 8d93f7429..01d4cdb85 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -109,42 +109,124 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) } } -int UNIFYFS_WRAP(rmdir)(const char* path) +/* Function for shared logic between rmdir, remove, and unlink wrappers. + * + * Remove the target at upath based on the input file mode indicated by mask. + * If mask == 0 (e.g., remove wrapper made request), the target doesn't need + * any mode specific checks before attempting to remove. + * */ +static int unifyfs_remove(const char* upath, mode_t mask) { - /* determine whether we should intercept this path */ - char upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(path, upath)) { - /* check if the mount point itself is being deleted */ - if (!strcmp(upath, unifyfs_mount_prefix)) { - errno = EBUSY; + /* check if the mount point itself is being deleted */ + if (!strcmp(upath, unifyfs_mount_prefix)) { + errno = EBUSY; + return -1; + } + + /* check if path exists locally */ + int fid = unifyfs_get_fid_from_path(upath); + if (fid >= 0) { + /* found path locally */ + int is_dir = unifyfs_fid_is_dir(fid); + + /* is it a directory? */ + if (is_dir) { + /* check if remove request was made for a regular file */ + if (mask & S_IFREG) { + /* ERROR: unlink likely made this request but path is a dir */ + LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); + errno = EISDIR; + return -1; + } + /* remove/rmdir likely made this request (mask & (0 | S_IFDIR)) */ + + /* is it empty? 
*/ + if (!unifyfs_fid_is_dir_empty(upath)) { + /* ERROR: is a directory, but isn't empty */ + LOGDBG("Attempting to remove non-empty dir %s in UNIFYFS", + upath); + errno = ENOTEMPTY; + return -1; + } + } else { /* not a directory */ + /* check if remove request was for a directory */ + if (mask & S_IFDIR) { + /* ERROR: rmdir likely made this request but path not a dir */ + LOGDBG("Attempting to rmdir a non-dir %s in UNIFYFS", upath); + errno = ENOTDIR; + return -1; + } + } + + /* remove the target from the file list */ + int ret = unifyfs_fid_unlink(fid); + if (ret != UNIFYFS_SUCCESS) { + /* failed to remove the target, + * set errno and return */ + LOGDBG("remove failed on %s in UNIFYFS", upath); + errno = unifyfs_rc_errno(ret); return -1; } + } else { + /* path doesn't exist locally, but may exist globally */ + int gfid = unifyfs_generate_gfid(upath); + unifyfs_file_attr_t attr = {0}; - /* check if path exists */ - int fid = unifyfs_get_fid_from_path(upath); - if (fid < 0) { + int ret = unifyfs_get_global_file_meta(gfid, &attr); + if (ret != UNIFYFS_SUCCESS) { + /* ERROR: path doesn't exist locally or globally */ + LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); errno = ENOENT; return -1; } /* is it a directory? */ - if (!unifyfs_fid_is_dir(fid)) { - errno = ENOTDIR; - return -1; + if (attr.mode & S_IFDIR) { + /* check if remove request was for a regular file */ + if (mask & S_IFREG) { + /* ERROR: unlink likely made this request but path is a dir */ + LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); + errno = EISDIR; + return -1; + } + + /* Current directory structure assumes all directories are empty. + * If wanting to enforce directory structure, will need logic to + * globally check whether a directory is empty. + * Possibly an rpc to check if_dir and if_empty at once to avoid + * status change between rpc calls.*/ + + } else { /* not a directory */ + /* check if remove request was for a directory */ + if ((mask & S_IFDIR)) { + LOGDBG("Attempting to rmdir a non-dir %s in UNIFYFS", upath); + errno = ENOTDIR; + return -1; + } } - /* is it empty? 
*/ - if (!unifyfs_fid_is_dir_empty(upath)) { - errno = ENOTEMPTY; + /* delete the target */ + ret = invoke_client_unlink_rpc(gfid); + if (ret != UNIFYFS_SUCCESS) { + LOGDBG("unlink rpc failed on %s in UNIFYFS", upath); + errno = unifyfs_rc_errno(ret); return -1; } + } - /* remove the directory from the file list */ - int ret = unifyfs_fid_unlink(fid); + /* success */ + return 0; +} + +int UNIFYFS_WRAP(rmdir)(const char* path) +{ + /* determine whether we should intercept this path */ + char upath[UNIFYFS_MAX_FILENAME]; + if (unifyfs_intercept_path(path, upath)) { + /* call shared logic function with S_IFDIR mask */ + int ret = unifyfs_remove(upath, S_IFDIR); if (ret != UNIFYFS_SUCCESS) { - /* failed to remove the directory, - * set errno and return */ - errno = unifyfs_rc_errno(ret); + LOGDBG("rmdir() failed on %s in UNIFYFS", upath); return -1; } @@ -575,27 +657,10 @@ int UNIFYFS_WRAP(unlink)(const char* path) /* determine whether we should intercept this path or not */ char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { - /* get file id for path name */ - int fid = unifyfs_get_fid_from_path(upath); - if (fid < 0) { - /* ERROR: file does not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); - errno = ENOENT; - return -1; - } - - /* check that it's not a directory */ - if (unifyfs_fid_is_dir(fid)) { - /* ERROR: is a directory */ - LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); - errno = EISDIR; - return -1; - } - - /* delete the file */ - int ret = unifyfs_fid_unlink(fid); + /* call shared logic function with S_IFREG mask */ + int ret = unifyfs_remove(upath, S_IFREG); if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_rc_errno(ret); + LOGDBG("unlink() failed on %s in UNIFYFS", upath); return -1; } @@ -613,29 +678,10 @@ int UNIFYFS_WRAP(remove)(const char* path) /* determine whether we should intercept this path or not */ char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { - /* get file id for path name */ - int fid = unifyfs_get_fid_from_path(upath); - if (fid < 0) { - /* ERROR: file does not exist */ - LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); - errno = ENOENT; - return -1; - } - - /* check that it's not a directory */ - if (unifyfs_fid_is_dir(fid)) { - /* TODO: shall be equivalent to rmdir(path) */ - /* ERROR: is a directory */ - LOGDBG("Attempting to remove a directory %s in UNIFYFS", upath); - errno = EISDIR; - return -1; - } - - /* shall be equivalent to unlink(path) */ - /* delete the file */ - int ret = unifyfs_fid_unlink(fid); + /* call shared logic function with 0 mask */ + int ret = unifyfs_remove(upath, 0); if (ret != UNIFYFS_SUCCESS) { - errno = unifyfs_rc_errno(ret); + LOGDBG("remove() failed on %s in UNIFYFS", upath); return -1; } diff --git a/common/src/unifyfs_const.h b/common/src/unifyfs_const.h index 392a051cc..b438c7899 100644 --- a/common/src/unifyfs_const.h +++ b/common/src/unifyfs_const.h @@ -77,7 +77,7 @@ // Log-based I/O #define UNIFYFS_LOGIO_CHUNK_SIZE (4 * MIB) #define UNIFYFS_LOGIO_SHMEM_SIZE (256 * MIB) -#define UNIFYFS_LOGIO_SPILL_SIZE (GIB) +#define UNIFYFS_LOGIO_SPILL_SIZE (4 * GIB) /* NOTE: max read size = UNIFYFS_MAX_SPLIT_CNT * META_DEFAULT_RANGE_SZ */ #define UNIFYFS_MAX_SPLIT_CNT (4 * KIB) diff --git a/docs/build.rst b/docs/build.rst index 709eab3c9..961e4d1b4 100644 --- a/docs/build.rst +++ b/docs/build.rst @@ -132,6 +132,9 @@ configure and build UnifyFS from its source code directory. 
$ spack load mochi-margo $ spack load spath $ + $ gotcha_install=$(spack location -i gotcha) + $ spath_install=$(spack location -i spath) + $ $ ./autogen.sh $ ./configure --prefix=/path/to/install --with-gotcha=${gotcha_install} --with-spath=${spath_install} $ make @@ -143,13 +146,6 @@ Alternatively, UnifyFS can be configured using ``CPPFLAGS`` and ``LDFLAGS``: $ ./configure --prefix=/path/to/install CPPFLAGS="-I${gotcha_install}/include -I{spath_install}/include" LDFLAGS="-L${gotcha_install}/lib64 -L${spath_install}/lib64 -.. admonition:: Spack package install location - - The location where Spack installs any given package can be retrieved by - running ``spack location -i ``. - - E.g.: ``gotcha_install=$(spack location -i gotcha)`` - To see all available build configuration options, run ``./configure --help`` after ``./autogen.sh`` has been run. diff --git a/docs/configuration.rst b/docs/configuration.rst index 56cc12aa2..5383729ed 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -108,7 +108,7 @@ files. =========== ====== ============================================================ chunk_size INT data chunk size (B) (default: 4 MiB) shmem_size INT maximum size (B) of data in shared memory (default: 256 MiB) - spill_size INT maximum size (B) of data in spillover file (default: 1 GiB) + spill_size INT maximum size (B) of data in spillover file (default: 4 GiB) spill_dir STRING path to spillover data directory =========== ====== ============================================================ diff --git a/docs/testing.rst b/docs/testing.rst index 18b55df82..bd7b5dae2 100644 --- a/docs/testing.rst +++ b/docs/testing.rst @@ -287,9 +287,9 @@ in (i.e., testing a wrapper that doesn't have any tests yet): Running the Tests ***************** -To manually run the UnifyFS unit test suite, simply run ``make check`` from the -inside the t/ directory of wherever you built UnifyFS. E.g., if you built in a -separate build/ directory, then do: +To manually run the UnifyFS unit test suite, simply run ``make check`` in a +single-node allocation from inside the t/ directory of wherever you built +UnifyFS. E.g., if you built in a separate build/ directory, then do: .. 
code-block:: BASH diff --git a/examples/src/checkpoint-restart.c b/examples/src/checkpoint-restart.c index 7099144fd..be908b6b7 100644 --- a/examples/src/checkpoint-restart.c +++ b/examples/src/checkpoint-restart.c @@ -233,8 +233,19 @@ int main(int argc, char* argv[]) } target_file = test_target_filename(cfg); - test_print_verbose_once(cfg, "DEBUG: creating target file %s", - target_file); + + // if reusing filename, remove old target file before starting timers + if (cfg->reuse_filename) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s for reuse", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + + test_print_verbose_once(cfg, + "DEBUG: creating target file %s", target_file); rc = test_create_file(cfg, target_file, O_RDWR); if (rc) { test_abort(cfg, rc); diff --git a/examples/src/testutil.h b/examples/src/testutil.h index 5ff920706..3bc6ec595 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -123,6 +123,7 @@ typedef struct { int enable_mpi_mount; /* automount during MPI_Init() */ char* output_file; /* print test messages to output file */ FILE* output_fp; + int reuse_filename; /* remove and then reuse filename from prior run*/ /* I/O behavior options */ int io_pattern; /* N1 or NN */ @@ -213,6 +214,7 @@ void test_config_print(test_cfg* cfg) fprintf(fp, "\t use_unifyfs = %d\n", cfg->use_unifyfs); fprintf(fp, "\t mpi_mount = %d\n", cfg->enable_mpi_mount); fprintf(fp, "\t outfile = %s\n", cfg->output_file); + fprintf(fp, "\t reuse_fname = %d\n", cfg->reuse_filename); fprintf(fp, "\n-- IO Behavior --\n"); fprintf(fp, "\t io_pattern = %s\n", io_pattern_str(cfg->io_pattern)); @@ -491,7 +493,7 @@ int test_is_static(const char* program) // common options for all tests -static const char* test_short_opts = "a:Ab:c:df:hkLm:MNn:o:p:PSt:T:UvVx"; +static const char* test_short_opts = "a:Ab:c:df:hkLm:MNn:o:p:PrSt:T:UvVx"; static const struct option test_long_opts[] = { { "appid", 1, 0, 'a' }, @@ -510,9 +512,10 @@ static const struct option test_long_opts[] = { { "outfile", 1, 0, 'o' }, { "pattern", 1, 0, 'p' }, { "prdwr", 0, 0, 'P' }, + { "reuse-filename", 0, 0, 'r' }, + { "stdio", 0, 0, 'S' }, { "pre-truncate", 1, 0, 't' }, { "post-truncate", 1, 0, 'T' }, - { "stdio", 0, 0, 'S' }, { "disable-unifyfs", 0, 0, 'U' }, { "verbose", 0, 0, 'v' }, { "vecio", 0, 0, 'V' }, @@ -555,6 +558,8 @@ static const char* test_usage_str = " (default: 'n1')\n" " -P, --prdwr use pread|pwrite instead of read|write\n" " (default: off)\n" + " -r, --reuse-filename remove and reuse the same target file name\n" + " (default: off)\n" " -S, --stdio use fread|fwrite instead of read|write\n" " (default: off)\n" " -t, --pre-truncate= truncate file to size (B) before writing\n" @@ -652,6 +657,10 @@ int test_process_argv(test_cfg* cfg, cfg->use_prdwr = 1; break; + case 'r': + cfg->reuse_filename = 1; + break; + case 'S': cfg->use_stdio = 1; break; @@ -1080,6 +1089,54 @@ int test_close_file(test_cfg* cfg) return 0; } +/* + * remove the given file if it exists + */ +static inline +int test_remove_file(test_cfg* cfg, const char* filepath) +{ + struct stat sb; + int rc; + + assert(NULL != cfg); + + /* stat file and simply return if it already doesn't exist */ + rc = stat(filepath, &sb); + if (rc) { + test_print_verbose_once(cfg, + "DEBUG: stat(%s): file already doesn't exist", filepath); + return 0; + } + + if (cfg->use_mpiio) { + MPI_CHECK(cfg, (MPI_File_delete(filepath, MPI_INFO_NULL))); + if (mpi_error) { + 
return -1; + } + return 0; + } + + /* POSIX I/O + * N-to-1 - rank 0 deletes shared files + * N-to-N - all ranks delete per-process files */ + if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { + if (cfg->use_stdio) { + rc = remove(filepath); + if (rc) { + test_print(cfg, "ERROR: remove(%s) failed", filepath); + return -1; + } + } else { + rc = unlink(filepath); + if (rc) { + test_print(cfg, "ERROR: unlink(%s) failed", filepath); + return -1; + } + } + } + return 0; +} + /* * create file at rank 0, open elsewhere */ diff --git a/examples/src/write.c b/examples/src/write.c index a3eb155f0..ca90e8e79 100644 --- a/examples/src/write.c +++ b/examples/src/write.c @@ -156,13 +156,24 @@ int main(int argc, char* argv[]) return -1; } + target_file = test_target_filename(cfg); + + // if reusing filename, remove old target file before starting timers + if (cfg->reuse_filename) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s for reuse", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + // timer to wrap all parts of write operation timer_start_barrier(cfg, &time_create2laminate); // create file - target_file = test_target_filename(cfg); - test_print_verbose_once(cfg, "DEBUG: creating target file %s", - target_file); + test_print_verbose_once(cfg, + "DEBUG: creating target file %s", target_file); timer_start_barrier(cfg, &time_create); rc = test_create_file(cfg, target_file, O_RDWR); if (rc) { diff --git a/examples/src/writeread.c b/examples/src/writeread.c index 369c2eb0d..5b8f7b2ec 100644 --- a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -218,8 +218,19 @@ int main(int argc, char* argv[]) return -1; } - // create file target_file = test_target_filename(cfg); + + // if reusing filename, remove old target file before starting timers + if (cfg->reuse_filename) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s for reuse", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + + // create file test_print_verbose_once(cfg, "DEBUG: creating target file %s", target_file); timer_start_barrier(cfg, &time_create); From 067e07416568eb1cda247dacbd2cd0edcbb4aea8 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 13 Jun 2021 07:53:02 -0700 Subject: [PATCH 14/81] Update semantics description in documentation --- docs/assumptions.rst | 101 ++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 25 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index c2af68b97..a181df835 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -1,52 +1,103 @@ ================ -Assumptions +Assumptions and Semantics ================ In this section, we provide assumptions we make about the behavior of -applications that use UnifyFS, and about how UnifyFS currently functions. +applications that use UnifyFS and about the file system semantics of UnifyFS. --------------------------- System Requirements --------------------------- -UnifyFS uses node-local storage devices, e.g., RAM and SSD, for storing the -application data. Therefore, a system should support the following requirements -to run UnifyFS. +The system requirements to run UnifyFS are: - - A compute node is equipped with a local storage device that UnifyFS can + - Compute nodes must be equipped with local storage device(s) that UnifyFS can use for storing file data, e.g., SSD or RAM. 
- - An ability for UnifyFS to launch user-level daemon processes on compute - nodes, which run concurrently with user application processes + - The system must support the ability for UnifyFS user-level daemon processes + to run concurrently with user application processes on compute nodes. --------------------------- Application Behavior --------------------------- -UnifyFS is specifically designed to support globally synchronous checkpointing -workloads. In such a workload, the expected application behavior is as follows. +UnifyFS is specifically designed to support the bulk synchronous I/O patterns +that are typical in HPC applications, e.g., checkpoint/restart or output dumps. +In bulk synchronous I/O, I/O operations occur in separate write and read phases, +and files are not read and written simultaneously. +For example, files are written during checkpointing (a write phase) +and read during recovery/restart (a read phase). +Additionally, parallel writes and reads to shared files occur systematically, +where processes access computable, regular offsets of files, e.g., in strided or +segmented access patterns, in contrast to random, interleaved, small writes and reads. +Note that a consequence of this assumption is that during a write phase, +if two procesess write concurrently to the +same file offset or to an overlapping region, the result is undefined and may +reflect the result of either processes' operation. + +UnifyFS offers the best performance for applications that exhibit the bulk +synchronous I/O pattern. While UnifyFS does support deviations to this pattern +(see Section XXXX), the performance might be slower and the user may +have to take additional steps to ensure correct execution of the application +with UnifyFS. +For example, during a write phase, a process can read any byte in +a file including remote data that has been written by processes in remote compute nodes. +However, the performance will differ based on which process wrote the data: + - If the bytes being read were written by the same process that wrote + the bytes, UnifyFS offers the fastest performance and no synchronization + operations are needed. This kind of access is typical in some I/O + libraries, e.g., HDF5, where file metadata may be updated and read by + the same process. + - If the bytes being read were written by a process on the same compute + node, UnifyFS can offer slightly slower performance and requires no + additional synchronization operations. + - If the bytes being read were written by a process on a different + compute node, then the performance is slower and the application must + introduce synchronization operations to ensure that the most recent + data is read. The synchronization can be achieved through adding + explicit "flush" operations in the application source code, + or by supplying the "write_sync" configuration parameter to UnifyFS + on startup, which will cause an implicit "flush" operation after + every write (note: the "write_sync" mode can significantly slow down + write performance.). See Section XXXX for more information. +In summary, reading the local data (which has been written by processes +executing on the same compute node) will always be faster than reading +remote data. - - I/O operations occur in separate write and read phases, and thus files are - not read and written simultaneously. For instance, files are only written - during the checkpointing (a write phase) and only read during the - recovery/restart (a read phase). 
- - During the read phase, a process can read any byte in a file including - remote data that has been written by processes in remote compute nodes. - However, reading the local data (which has been written by processes in - the same compute node) will be faster than reading the remote data. - - - During the write phase, the result of concurrently writing to the same - file offset by multiple processes is undefined. Similarly, multiple - processes writing to an overlapped region also leads to an undefined - result. For example, if a command in the job renames a file while the - parallel application is writing to it, the outcome is undefined, i.e., it - could be a success or failure depending on timing. --------------------------- Consistency Model --------------------------- +The UnifyFS file system does not support strict POSIX consistency semantics. +Instead, UnifyFS supports two different models: +*commit consistency semantics* when a file is actively +being modified; and *lamination semantics* when the file is no longer being +modified by the application. +These two consistency models provide opportunities for UnifyFS to +provide better performance for the I/O operatations of HPC applications. + +''''''''''''''''''''''''''' +Commit Consistency Semantics in UnifyFS +''''''''''''''''''''''''''' + +Commit consistency semantics rquire +explicit "commit" operations to be performed before updates to a file +are globally visible (Please see Chen et al., HPDC 2021 XXXX for more details +on different file system consistency semantics models.) +We chose commit consistency semantics for UnifyFS because it is sufficient +for correct execution of typical HPC applications that perform I/O +in a bulk synchronous pattern, and enables UnifyFS to provide better +performance. For example, because we assume that applications using UnifyFS +will not execute concurrent modifications to the same file offset, +UnifyFS does not have to employ expensive locking to ensure sequential +access to file regions. + +''''''''''''''''''''''''''' +Lamination Consistency Semantics in UnifyFS +''''''''''''''''''''''''''' + One key aspect of UnifyFS is the idea of "laminating" a file. After a file is laminated, it becomes "set in stone," and its data is accessible across all the nodes. Laminated files are permanently read-only and cannot be further modified, From f910cfdcd8234a0336bc00a5b589d3722a37bf25 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 13 Jun 2021 11:34:04 -0700 Subject: [PATCH 15/81] rough draft --- docs/assumptions.rst | 187 ++++++++++++++++++++++++++++--------------- 1 file changed, 121 insertions(+), 66 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index a181df835..c0b26c89f 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -21,8 +21,8 @@ The system requirements to run UnifyFS are: Application Behavior --------------------------- -UnifyFS is specifically designed to support the bulk synchronous I/O patterns -that are typical in HPC applications, e.g., checkpoint/restart or output dumps. +UnifyFS is specifically designed to support the bulk synchronous I/O pattern +that is typical in HPC applications, e.g., checkpoint/restart or output dumps. In bulk synchronous I/O, I/O operations occur in separate write and read phases, and files are not read and written simultaneously. For example, files are written during checkpointing (a write phase) @@ -30,40 +30,14 @@ and read during recovery/restart (a read phase). 
Additionally, parallel writes and reads to shared files occur systematically, where processes access computable, regular offsets of files, e.g., in strided or segmented access patterns, in contrast to random, interleaved, small writes and reads. -Note that a consequence of this assumption is that during a write phase, -if two procesess write concurrently to the -same file offset or to an overlapping region, the result is undefined and may -reflect the result of either processes' operation. UnifyFS offers the best performance for applications that exhibit the bulk -synchronous I/O pattern. While UnifyFS does support deviations to this pattern -(see Section XXXX), the performance might be slower and the user may +synchronous I/O pattern. While UnifyFS does support deviations to this pattern, +the performance might be slower and the user may have to take additional steps to ensure correct execution of the application with UnifyFS. -For example, during a write phase, a process can read any byte in -a file including remote data that has been written by processes in remote compute nodes. -However, the performance will differ based on which process wrote the data: - - If the bytes being read were written by the same process that wrote - the bytes, UnifyFS offers the fastest performance and no synchronization - operations are needed. This kind of access is typical in some I/O - libraries, e.g., HDF5, where file metadata may be updated and read by - the same process. - - If the bytes being read were written by a process on the same compute - node, UnifyFS can offer slightly slower performance and requires no - additional synchronization operations. - - If the bytes being read were written by a process on a different - compute node, then the performance is slower and the application must - introduce synchronization operations to ensure that the most recent - data is read. The synchronization can be achieved through adding - explicit "flush" operations in the application source code, - or by supplying the "write_sync" configuration parameter to UnifyFS - on startup, which will cause an implicit "flush" operation after - every write (note: the "write_sync" mode can significantly slow down - write performance.). See Section XXXX for more information. -In summary, reading the local data (which has been written by processes -executing on the same compute node) will always be faster than reading -remote data. - +For more information on this topic, refer to the section on +:ref:`commit consistency semantics in UnifyFS `_. --------------------------- @@ -71,44 +45,112 @@ Consistency Model --------------------------- The UnifyFS file system does not support strict POSIX consistency semantics. -Instead, UnifyFS supports two different models: +(Please see Chen et al., HPDC 2021 XXXX **link needed** for more details +on different file system consistency semantics models.) +Instead, UnifyFS supports two different consistency models: *commit consistency semantics* when a file is actively being modified; and *lamination semantics* when the file is no longer being modified by the application. These two consistency models provide opportunities for UnifyFS to -provide better performance for the I/O operatations of HPC applications. +provide better performance for the I/O operations of HPC applications. ''''''''''''''''''''''''''' Commit Consistency Semantics in UnifyFS ''''''''''''''''''''''''''' +.. 
_commit_consistency_label: -Commit consistency semantics rquire +Commit consistency semantics require explicit "commit" operations to be performed before updates to a file -are globally visible (Please see Chen et al., HPDC 2021 XXXX for more details -on different file system consistency semantics models.) +are globally visible We chose commit consistency semantics for UnifyFS because it is sufficient -for correct execution of typical HPC applications that perform I/O -in a bulk synchronous pattern, and enables UnifyFS to provide better -performance. For example, because we assume that applications using UnifyFS +for correct execution of typical HPC applications that adhere to +the bulk synchronous I/O pattern, and enables UnifyFS to provide better +performance than with strict POSIX semantics. For example, because +we assume that applications using UnifyFS will not execute concurrent modifications to the same file offset, -UnifyFS does not have to employ expensive locking to ensure sequential -access to file regions. +UnifyFS does not have to employ locking to ensure sequential +access to file regions. This assumption allows us to cache file +modifications locally which greatly improves the write performance +of UnifyFS. + +During a write phase, a process can deviate from the bulk synchronous +I/O pattern and read any byte in +a file, including remote data that has been written by processes +executing on remote compute nodes in the job. +However, the performance will differ based on which process wrote the data that +is being read: + - If the bytes being read were written by the same process that is reading + the bytes, UnifyFS offers the fastest performance and no synchronization + operations are needed. This kind of access is typical in some I/O + libraries, e.g., HDF5, where file metadata may be updated and read by + the same process. + - If the bytes being read were written by a process executing on the same compute + node as the reading process, UnifyFS can offer slightly slower performance + than the first case and requires no additional synchronization operations. + - If the bytes being read were written by a process executing on a different + compute node than the reading process, then the performance is slower + than the first two cases and the application must + introduce synchronization operations to ensure that the most recent + data is read. +In summary, reading the local data (which has been written by processes +executing on the same compute node) will always be faster than reading +remote data. + +Synchronization operations are required for applications that exhibit +I/O accesses that deviate from the bulk synchronous I/O pattern. +There are several methods by which applications can adhere to the synchronization +requirements. + - Using MPI-IO. The (MPI-IO_) interface requirements are a good match for the + consistency model of UnifyFS. Specifically, the MPI-IO interface requires + explicit synchronization in order for updates made by processes to + be globally visible. If an application utilizes the MPI-IO interface + correctly, it will adhere to the requirements of UnifyFS. + - Using (HDF5_) and other parallel I/O libraries. Most parallel I/O libraries + hide the synchronization requirements of file systems from their users. + For example, HDF5 implements the synchronization required by the MPI-IO + interface so users of HDF5 do not need to perform any synchronization + operations explicitly in their codes. + - With explicit synchronization. 
If an application does not use a compliant + parallel I/O library or if the developer wishes to perform explicit + synchronization, the synchronization can be achieved through adding + explicit "flush" operations in the application source code, + or by supplying the "write_sync" configuration parameter to UnifyFS + on startup, which will cause an implicit "flush" operation after + every write (note: the "write_sync" mode can significantly slow down + write performance.). **which operations are a flush?*** + +**How can one check if an application is properly synchronized??** + + +Note that commit semantics also require synchronization for potentially conflicting +write accesses. If an application does not enforce sequential ordering of file +modifications during a write phase, e.g., with MPI synchronization, +and multiple processes write concurrently to the same file offset or to an +overlapping region, the result is undefined and may +reflect the result of any of the processes' operations to that offset or region. +**I don't think this paragraph is true. I think we won't return the last write even if synchronization is applied** ''''''''''''''''''''''''''' Lamination Consistency Semantics in UnifyFS ''''''''''''''''''''''''''' One key aspect of UnifyFS is the idea of "laminating" a file. After a file is -laminated, it becomes "set in stone," and its data is accessible across all the -nodes. Laminated files are permanently read-only and cannot be further modified, -except for being renamed or deleted. If the application process group fails -before a file has been laminated, UnifyFS may delete the file. - -A typical use case is to laminate application checkpoint files after they have -been successfully written. To laminate a file, an application can simply call -chmod() to remove all the write bits, after its write phase is completed. When -write bits of a file are all canceled, UnifyFS will internally laminate the -file. A typical checkpoint will look like: +laminated, it becomes permanently read-only and its data is accessible across +all the compute nodes in the job. +Once a file is laminated, it cannot be further modified, +except for being renamed or deleted. +**If the application process group fails +before a file has been laminated, UnifyFS may delete the file.** +If a failure occurs during a job before a file is laminated, the file +contents may be unrecoverable. + +A typical use case for lamination is for checkpoint/restart. +An application can laminate checkpoint files after they have +been successfully written so that they can be read by any process on any compute +node in the job in a restart operation. To laminate a file, an application +can simply call chmod() to remove all the write bits, after its write phase +is completed. When write bits of a file are removed, UnifyFS will laminate the +file. A typical checkpoint write operation with UnifyFS will look like: .. code-block:: C @@ -117,18 +159,28 @@ file. A typical checkpoint will look like: close(fd) chmod("checkpoint1.chk", 0444) -Future versions of UnifyFS may support different laminate semantics, such as -laminate on close() or laminate via an explicit API call. +We plan for future versions of UnifyFS to support different methods for +laminating files, such as +laminating all files on close() or laminating via an explicit API call. We define the laminated consistency model to enable certain optimizations while -supporting the perceived requirements of application checkpoints. 
Since remote -processes are not permitted to read arbitrary bytes of a file until its -lamination, UnifyFS can buffer all data and metadata of the file locally -(instead of exchanging indexing information between compute nodes) before the -lamination occurs. Also, since file contents cannot change after lamination, +supporting the typical requirements of bulk synchronous I/O. +Recall that for bulk synchronous I/O patterns, reads and writes typically occur in +distinct phases. This means that for the majority of the time, +processes do not need to read arbitrary +bytes of a file until the write phase is completed, which in practice is +when the file is done being modified and closed and can be safely made +read-only with lamination. +By assuming that processes do not need to access file data modified +by other processes before lamination, +UnifyFS can optimize write performance by buffering all metadata and +file data for processes locally, instead of performing costly exchanges of +metadata and file data between compute nodes on every write. +Also, since file contents cannot change after lamination, aggressive caching may be used during the read phase with minimal locking. Further, since a file may be lost on application failure unless laminated, data redundancy schemes can be delayed until lamination. +**do we need to define our failure behavior better?** The following lists summarize available application I/O operations according to our consistency model. @@ -141,7 +193,7 @@ Behavior before lamination (write phase): to the same location, the value is undefined. - read: A process may read bytes it has written. Reading other bytes is - invalid. + invalid **without explicit synchronization operations.** - rename: A process may rename a file. @@ -173,26 +225,29 @@ The additional behavior of UnifyFS can be summarized as follows. persisted to stable storage like a parallel file system (PFS). When the data needs to be persisted to an external file system, users can use :ref:`unifyfs utility ` with its data staging - options. + options. **need to add API options** - UnifyFS also can be coupled with SymphonyFS_, high level I/O libraries, or a checkpoint library (VeloC_) to move data to PFS periodically. - - UnifyFS can be used with checkpointing libraries (VeloC_) or other I/O - libraries to support shared files on burst buffers. + - UnifyFS can be used with checkpointing libraries like (SCR_) or (VeloC_), + or with I/O libraries libraries like (HDF5_) to support shared files on burst buffers. - - UnifyFS starts empty at job start. User job must populate the file system + - The UnifyFS file system will be empty at job start. User job must populate the file system manually or by using :ref:`unifyfs utility `. + **need to add API options** - UnifyFS creates a shared file system namespace across all compute nodes in a job, even if an application process is not running on all compute nodes. - UnifyFS survives across multiple application runs within a job. - - UnifyFS will transparently intercept system level I/O calls of + - UnifyFS transparently intercepts system level I/O calls of applications and I/O libraries. .. _SymphonyFS: https://code.ornl.gov/techint/SymphonyFS .. _VeloC: https://github.com/ECP-VeloC/VELOC - +.. _SCR: https://github.com/llnl/scr +.. _HDF5: https://www.hdfgroup.org/ +.. 
_MPI-IO: https://www.mpi-forum.org/docs/ From 550f1a70a67e40eb70d40175679b2e350edbdcec Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 13 Jun 2021 11:39:32 -0700 Subject: [PATCH 16/81] move a section --- docs/assumptions.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index c0b26c89f..9a23c9aa5 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -182,6 +182,10 @@ Further, since a file may be lost on application failure unless laminated, data redundancy schemes can be delayed until lamination. **do we need to define our failure behavior better?** +--------------------------- +File System Behavior +--------------------------- + The following lists summarize available application I/O operations according to our consistency model. @@ -215,10 +219,6 @@ Behavior after lamination (read phase): - unlink: A process may delete a file. ---------------------------- -File System Behavior ---------------------------- - The additional behavior of UnifyFS can be summarized as follows. - UnifyFS exists on node local storage only and is not automatically From a0d9d0849923bfc5b0ea090a0a0a0e0b7d140475 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Mon, 14 Jun 2021 05:41:12 -0700 Subject: [PATCH 17/81] fix a few typos --- docs/assumptions.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 9a23c9aa5..b85252b14 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -61,7 +61,7 @@ Commit Consistency Semantics in UnifyFS Commit consistency semantics require explicit "commit" operations to be performed before updates to a file -are globally visible +are globally visible. We chose commit consistency semantics for UnifyFS because it is sufficient for correct execution of typical HPC applications that adhere to the bulk synchronous I/O pattern, and enables UnifyFS to provide better @@ -228,12 +228,12 @@ The additional behavior of UnifyFS can be summarized as follows. options. **need to add API options** - UnifyFS also can be coupled with SymphonyFS_, high level I/O libraries, or - a checkpoint library (VeloC_) to move data to PFS periodically. + a checkpoint library like (SCR_) or (VeloC_) to move data to the PFS periodically. - UnifyFS can be used with checkpointing libraries like (SCR_) or (VeloC_), or with I/O libraries libraries like (HDF5_) to support shared files on burst buffers. - - The UnifyFS file system will be empty at job start. User job must populate the file system + - The UnifyFS file system will be empty at job start. A user job must populate the file system manually or by using :ref:`unifyfs utility `. **need to add API options** @@ -241,7 +241,7 @@ The additional behavior of UnifyFS can be summarized as follows. - UnifyFS creates a shared file system namespace across all compute nodes in a job, even if an application process is not running on all compute nodes. - - UnifyFS survives across multiple application runs within a job. + - UnifyFS survives across multiple application runs within a job. **what if there is a failure???** - UnifyFS transparently intercepts system level I/O calls of applications and I/O libraries. 
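
The explicit-synchronization requirement described in the assumptions.rst changes above can be illustrated with a small MPI program. The sketch below is not part of the patch series and is illustrative only: it assumes UnifyFS is mounted at /unifyfs, uses a made-up file name, and omits error checking. Rank 0 commits its write with an explicit fsync() (one of the "flush" operations the docs list, alongside fflush() and close()), and an MPI_Barrier() orders the writer ahead of a reader on another node.

.. code-block:: C

    /* Sketch: cross-node producer/consumer on UnifyFS with explicit sync.
     * Rank 0 writes a block and flushes it; rank 1 reads it only after a
     * barrier, so the writer's update is visible before the read begins. */
    #include <fcntl.h>
    #include <mpi.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(int argc, char** argv)
    {
        int rank;
        const char* path = "/unifyfs/shared.dat";  /* assumed mount point */
        char buf[64];

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        if (rank == 0) {
            int fd = open(path, O_CREAT | O_WRONLY, 0644);
            snprintf(buf, sizeof(buf), "data from rank 0");
            write(fd, buf, strlen(buf) + 1);
            fsync(fd);  /* "flush": make the written extents visible */
            close(fd);
        }

        /* global synchronization: readers wait until the writer has flushed */
        MPI_Barrier(MPI_COMM_WORLD);

        if (rank == 1) {
            int fd = open(path, O_RDONLY);
            read(fd, buf, sizeof(buf));
            printf("rank 1 read: %s\n", buf);
            close(fd);
        }

        MPI_Finalize();
        return 0;
    }

Per the semantics described above, without the fsync() (or the client.write_sync setting), a read issued from a different compute node is not guaranteed to observe the most recent data.
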
From 045dcafa2390e000eb170013ccdaa4e6f0027aaf Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Mon, 14 Jun 2021 11:47:10 -0700 Subject: [PATCH 18/81] better draft --- docs/assumptions.rst | 78 ++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index b85252b14..50070814a 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -45,8 +45,9 @@ Consistency Model --------------------------- The UnifyFS file system does not support strict POSIX consistency semantics. -(Please see Chen et al., HPDC 2021 XXXX **link needed** for more details +(Please see Chen et al., HPDC 2021 for more details on different file system consistency semantics models.) +.. paper link needed when proceedings are out Instead, UnifyFS supports two different consistency models: *commit consistency semantics* when a file is actively being modified; and *lamination semantics* when the file is no longer being @@ -73,6 +74,31 @@ access to file regions. This assumption allows us to cache file modifications locally which greatly improves the write performance of UnifyFS. +The use of synchronization operations are required for applications that exhibit +I/O accesses that deviate from the bulk synchronous I/O pattern to perform +correctly with UnifyFS. +There are several methods by which applications can adhere to the synchronization +requirements. + - Using MPI-IO. The (MPI-IO_) interface requirements are a good match for the + consistency model of UnifyFS. Specifically, the MPI-IO interface requires + explicit synchronization in order for updates made by processes to + be globally visible. If an application utilizes the MPI-IO interface + correctly, it will adhere to the requirements of UnifyFS. + - Using (HDF5_) and other parallel I/O libraries. Most parallel I/O libraries + hide the synchronization requirements of file systems from their users. + For example, HDF5 implements the synchronization required by the MPI-IO + interface so users of HDF5 do not need to perform any synchronization + operations explicitly in their codes. + - With explicit synchronization. If an application does not use a compliant + parallel I/O library or if the developer wishes to perform explicit + synchronization, the synchronization can be achieved through adding + explicit "flush" operations with calls to fflush(), close(), or fsync() + in the application source code, + or by supplying the "write_sync" configuration parameter to UnifyFS + on startup, which will cause an implicit "flush" operation after + every write (note: the "write_sync" mode can significantly slow down + write performance.). + During a write phase, a process can deviate from the bulk synchronous I/O pattern and read any byte in a file, including remote data that has been written by processes @@ -96,32 +122,6 @@ In summary, reading the local data (which has been written by processes executing on the same compute node) will always be faster than reading remote data. -Synchronization operations are required for applications that exhibit -I/O accesses that deviate from the bulk synchronous I/O pattern. -There are several methods by which applications can adhere to the synchronization -requirements. - - Using MPI-IO. The (MPI-IO_) interface requirements are a good match for the - consistency model of UnifyFS. Specifically, the MPI-IO interface requires - explicit synchronization in order for updates made by processes to - be globally visible. 
If an application utilizes the MPI-IO interface - correctly, it will adhere to the requirements of UnifyFS. - - Using (HDF5_) and other parallel I/O libraries. Most parallel I/O libraries - hide the synchronization requirements of file systems from their users. - For example, HDF5 implements the synchronization required by the MPI-IO - interface so users of HDF5 do not need to perform any synchronization - operations explicitly in their codes. - - With explicit synchronization. If an application does not use a compliant - parallel I/O library or if the developer wishes to perform explicit - synchronization, the synchronization can be achieved through adding - explicit "flush" operations in the application source code, - or by supplying the "write_sync" configuration parameter to UnifyFS - on startup, which will cause an implicit "flush" operation after - every write (note: the "write_sync" mode can significantly slow down - write performance.). **which operations are a flush?*** - -**How can one check if an application is properly synchronized??** - - Note that commit semantics also require synchronization for potentially conflicting write accesses. If an application does not enforce sequential ordering of file modifications during a write phase, e.g., with MPI synchronization, @@ -130,17 +130,20 @@ overlapping region, the result is undefined and may reflect the result of any of the processes' operations to that offset or region. **I don't think this paragraph is true. I think we won't return the last write even if synchronization is applied** +.. How can users check that their application is correctly synchronized? Will we have the checker scripts ready? + ''''''''''''''''''''''''''' Lamination Consistency Semantics in UnifyFS ''''''''''''''''''''''''''' -One key aspect of UnifyFS is the idea of "laminating" a file. After a file is +The other consistency model that UnifyFS employs is called "lamination +semantics" which is intended to be applied once a file is done being modified +at the end of a write phase of an application. After a file is laminated, it becomes permanently read-only and its data is accessible across -all the compute nodes in the job. +all the compute nodes in the job without further synchronization. Once a file is laminated, it cannot be further modified, except for being renamed or deleted. -**If the application process group fails -before a file has been laminated, UnifyFS may delete the file.** +.. Is the next sentence true? Does more need to be added? If a failure occurs during a job before a file is laminated, the file contents may be unrecoverable. @@ -180,7 +183,7 @@ Also, since file contents cannot change after lamination, aggressive caching may be used during the read phase with minimal locking. Further, since a file may be lost on application failure unless laminated, data redundancy schemes can be delayed until lamination. -**do we need to define our failure behavior better?** +.. do we need to define our failure behavior better? --------------------------- File System Behavior @@ -197,7 +200,7 @@ Behavior before lamination (write phase): to the same location, the value is undefined. - read: A process may read bytes it has written. Reading other bytes is - invalid **without explicit synchronization operations.** + invalid without explicit synchronization operations. - rename: A process may rename a file. @@ -225,7 +228,7 @@ The additional behavior of UnifyFS can be summarized as follows. persisted to stable storage like a parallel file system (PFS). 
When the data needs to be persisted to an external file system, users can use :ref:`unifyfs utility ` with its data staging - options. **need to add API options** + options. - UnifyFS also can be coupled with SymphonyFS_, high level I/O libraries, or a checkpoint library like (SCR_) or (VeloC_) to move data to the PFS periodically. @@ -236,12 +239,15 @@ The additional behavior of UnifyFS can be summarized as follows. - The UnifyFS file system will be empty at job start. A user job must populate the file system manually or by using :ref:`unifyfs utility `. - **need to add API options** - UnifyFS creates a shared file system namespace across all compute nodes in a job, even if an application process is not running on all compute nodes. - - UnifyFS survives across multiple application runs within a job. **what if there is a failure???** + - UnifyFS survives across multiple application runs within a job. + + - If a failure occurs during a job before a file is laminated, the file + contents may be unrecoverable. +.. is this adequate to describe failure behavior? - UnifyFS transparently intercepts system level I/O calls of applications and I/O libraries. From f6509b0466cbfcc52d04c81daebf9f373e57c561 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 20 Jun 2021 08:28:39 -0700 Subject: [PATCH 19/81] typos and small clarifications --- docs/assumptions.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 50070814a..0ad02eaec 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -14,7 +14,7 @@ The system requirements to run UnifyFS are: - Compute nodes must be equipped with local storage device(s) that UnifyFS can use for storing file data, e.g., SSD or RAM. - - The system must support the ability for UnifyFS user-level daemon processes + - The system must support the ability for UnifyFS user-level server processes to run concurrently with user application processes on compute nodes. --------------------------- @@ -32,7 +32,7 @@ where processes access computable, regular offsets of files, e.g., in strided or segmented access patterns, in contrast to random, interleaved, small writes and reads. UnifyFS offers the best performance for applications that exhibit the bulk -synchronous I/O pattern. While UnifyFS does support deviations to this pattern, +synchronous I/O pattern. While UnifyFS does support deviations from this pattern, the performance might be slower and the user may have to take additional steps to ensure correct execution of the application with UnifyFS. @@ -94,9 +94,9 @@ requirements. synchronization, the synchronization can be achieved through adding explicit "flush" operations with calls to fflush(), close(), or fsync() in the application source code, - or by supplying the "write_sync" configuration parameter to UnifyFS + or by supplying the client.write_sync configuration parameter to UnifyFS on startup, which will cause an implicit "flush" operation after - every write (note: the "write_sync" mode can significantly slow down + every write (note: use of the client.write_sync mode can significantly slow down write performance.). During a write phase, a process can deviate from the bulk synchronous @@ -109,7 +109,8 @@ is being read: the bytes, UnifyFS offers the fastest performance and no synchronization operations are needed. This kind of access is typical in some I/O libraries, e.g., HDF5, where file metadata may be updated and read by - the same process. + the same process. 
(Note: to obtain the performance benefit for this case, + one must set the client.local_extents configuration parameter.) - If the bytes being read were written by a process executing on the same compute node as the reading process, UnifyFS can offer slightly slower performance than the first case and requires no additional synchronization operations. @@ -234,7 +235,7 @@ The additional behavior of UnifyFS can be summarized as follows. a checkpoint library like (SCR_) or (VeloC_) to move data to the PFS periodically. - UnifyFS can be used with checkpointing libraries like (SCR_) or (VeloC_), - or with I/O libraries libraries like (HDF5_) to support shared files on burst buffers. + or with I/O libraries like (HDF5_) to support shared files on burst buffers. - The UnifyFS file system will be empty at job start. A user job must populate the file system manually or by using From 21cba020c6848cd688daeff349a09c6cb5efb46f Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 20 Jun 2021 09:26:34 -0700 Subject: [PATCH 20/81] adding failure behavior section --- docs/assumptions.rst | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 0ad02eaec..758bbed5a 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -182,16 +182,27 @@ file data for processes locally, instead of performing costly exchanges of metadata and file data between compute nodes on every write. Also, since file contents cannot change after lamination, aggressive caching may be used during the read phase with minimal locking. -Further, since a file may be lost on application failure unless laminated, data -redundancy schemes can be delayed until lamination. -.. do we need to define our failure behavior better? --------------------------- File System Behavior --------------------------- -The following lists summarize available application I/O operations according to -our consistency model. +The following summarize the behavior of UnifyFS under our +consistency model. + +Failure behavior: + - In the event of a compute node failure, all file data from the processes running + on the failed compute node will be lost. + - In the event of the failure of a UnifyFS server process, all file data from + the processes assigned to that server process (typically on the same compute + node) will be lost. + - In the event of application process failures when the UnifyFS server + processes remain running, the file data can retrieved by the local + UnifyFS server or a remote UnifyFS server. + - The UnifyFS team plans to improve the reliability of UnifyFS in the event + of failures using redundancy scheme implementations available from + the (VeloC_) project as part of a future release. + Behavior before lamination (write phase): @@ -246,10 +257,6 @@ The additional behavior of UnifyFS can be summarized as follows. - UnifyFS survives across multiple application runs within a job. - - If a failure occurs during a job before a file is laminated, the file - contents may be unrecoverable. -.. is this adequate to describe failure behavior? - - UnifyFS transparently intercepts system level I/O calls of applications and I/O libraries. 
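Taken together, the synchronization and lamination rules described in the docs/assumptions.rst changes above amount to a simple write-phase/read-phase skeleton. The following is a minimal sketch, not code from the repository: the /unifyfs mountpoint, file name, and block size are illustrative assumptions, error handling is omitted, and the chmod() call follows the documented convention of dropping the write permission bits to request lamination.

    /* Sketch: bulk-synchronous write phase on UnifyFS with explicit local
     * synchronization (fsync), inter-process synchronization (MPI_Barrier),
     * and lamination requested by removing write bits with chmod().
     * The mountpoint and sizes below are assumptions for illustration. */
    #include <fcntl.h>
    #include <mpi.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/stat.h>
    #include <unistd.h>

    #define BLOCK_SIZE (1 << 20)   /* 1 MiB per rank (assumed) */

    int main(int argc, char** argv)
    {
        int rank, nranks;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nranks);

        const char* path = "/unifyfs/checkpoint.dat";   /* assumed mountpoint */

        /* Write phase: each rank writes a disjoint, strided block. */
        int fd = open(path, O_CREAT | O_WRONLY, 0644);
        char* buf = malloc(BLOCK_SIZE);
        memset(buf, rank, BLOCK_SIZE);
        pwrite(fd, buf, BLOCK_SIZE, (off_t)rank * BLOCK_SIZE);

        /* Local synchronization: flush so this rank's extents become
         * visible to other processes. */
        fsync(fd);
        close(fd);

        /* Inter-process synchronization: make sure every rank has flushed
         * before any rank reads another rank's data or laminates the file. */
        MPI_Barrier(MPI_COMM_WORLD);

        /* Lamination: dropping the write bits marks the file permanently
         * read-only, after which its data is readable from any compute node. */
        if (rank == 0) {
            struct stat sb;
            stat(path, &sb);
            chmod(path, sb.st_mode & ~(mode_t)0222);
        }
        MPI_Barrier(MPI_COMM_WORLD);

        /* Read phase: any rank may now read any region, e.g. a neighbor's block. */
        fd = open(path, O_RDONLY);
        pread(fd, buf, BLOCK_SIZE, (off_t)((rank + 1) % nranks) * BLOCK_SIZE);
        close(fd);

        free(buf);
        MPI_Finalize();
        return 0;
    }

If an application cannot add explicit flush calls, the client.write_sync configuration option described above provides the same local synchronization implicitly after every write, at the cost of write performance; the inter-process ordering (the barriers in the sketch) is still the application's responsibility.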
From ef0ec19ae303a6121b12b6773962135a35ae1c05 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 20 Jun 2021 09:30:21 -0700 Subject: [PATCH 21/81] formatting --- docs/assumptions.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 758bbed5a..715afe1dc 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -191,14 +191,18 @@ The following summarize the behavior of UnifyFS under our consistency model. Failure behavior: + - In the event of a compute node failure, all file data from the processes running on the failed compute node will be lost. + - In the event of the failure of a UnifyFS server process, all file data from the processes assigned to that server process (typically on the same compute node) will be lost. + - In the event of application process failures when the UnifyFS server processes remain running, the file data can retrieved by the local UnifyFS server or a remote UnifyFS server. + - The UnifyFS team plans to improve the reliability of UnifyFS in the event of failures using redundancy scheme implementations available from the (VeloC_) project as part of a future release. From 4f8ce33b82be6f8068b8bcf595ad3ed2f0c323ac Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sun, 20 Jun 2021 10:19:31 -0700 Subject: [PATCH 22/81] added discussion of local vs global synchronization --- docs/assumptions.rst | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 715afe1dc..2e0965b86 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -75,10 +75,27 @@ modifications locally which greatly improves the write performance of UnifyFS. The use of synchronization operations are required for applications that exhibit -I/O accesses that deviate from the bulk synchronous I/O pattern to perform -correctly with UnifyFS. +I/O accesses that deviate from the bulk synchronous I/O pattern. +There are two types of synchronization that are required for correct execution +of parallel I/O on UnifyFS: *local synchronization* and *global synchronization*. +Here, *local synchronization* refers to synchronization operations performed +locally by a process to ensure that its updates to a file are visible +to other processes. For example, a process may update a region of a file +and then execute fflush() so that a different process can read the updated +file contents. +*Global synchronization* refers to synchronization +operations that are performed to enforce ordering of conflicting I/O operations +from multiple processes. +These global synchronizations occur outside of normal file I/O operations and +typically involve interprocess communication, e.g., with MPI. For example, +if two processes need to update the same file region and it is important to +the outcome of the program that the updates occur in a particular order, then +the program needs to enforce this ordering with an operation like an MPI_Barrier() +to be sure that the first process has completed its updates before the next +process begins its updates. + There are several methods by which applications can adhere to the synchronization -requirements. +requirements of UnifyFS. - Using MPI-IO. The (MPI-IO_) interface requirements are a good match for the consistency model of UnifyFS. Specifically, the MPI-IO interface requires explicit synchronization in order for updates made by processes to @@ -91,13 +108,14 @@ requirements. operations explicitly in their codes. 
- With explicit synchronization. If an application does not use a compliant parallel I/O library or if the developer wishes to perform explicit - synchronization, the synchronization can be achieved through adding + synchronization, local synchronization can be achieved through adding explicit "flush" operations with calls to fflush(), close(), or fsync() in the application source code, or by supplying the client.write_sync configuration parameter to UnifyFS on startup, which will cause an implicit "flush" operation after every write (note: use of the client.write_sync mode can significantly slow down - write performance.). + write performance.). In this case, global synchronization is still required + for applications that perform conflicting updates to files. During a write phase, a process can deviate from the bulk synchronous I/O pattern and read any byte in @@ -113,7 +131,8 @@ is being read: one must set the client.local_extents configuration parameter.) - If the bytes being read were written by a process executing on the same compute node as the reading process, UnifyFS can offer slightly slower performance - than the first case and requires no additional synchronization operations. + than the first case and the application must introduce synchronization + operations to ensure that the most recent data is read. - If the bytes being read were written by a process executing on a different compute node than the reading process, then the performance is slower than the first two cases and the application must @@ -123,7 +142,8 @@ In summary, reading the local data (which has been written by processes executing on the same compute node) will always be faster than reading remote data. -Note that commit semantics also require synchronization for potentially conflicting +Note that, as we discuss above, commit semantics also require global synchronization +for potentially conflicting write accesses. If an application does not enforce sequential ordering of file modifications during a write phase, e.g., with MPI synchronization, and multiple processes write concurrently to the same file offset or to an From a69c96e669b9633cc295835deaa9b6becfb78ff4 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 09:50:48 -0700 Subject: [PATCH 23/81] typo and tweaking wording for "well-behaved" applications --- docs/assumptions.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 2e0965b86..d91d24c05 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -65,7 +65,7 @@ explicit "commit" operations to be performed before updates to a file are globally visible. We chose commit consistency semantics for UnifyFS because it is sufficient for correct execution of typical HPC applications that adhere to -the bulk synchronous I/O pattern, and enables UnifyFS to provide better +the bulk synchronous I/O pattern, and it enables UnifyFS to provide better performance than with strict POSIX semantics. For example, because we assume that applications using UnifyFS will not execute concurrent modifications to the same file offset, @@ -195,7 +195,7 @@ processes do not need to read arbitrary bytes of a file until the write phase is completed, which in practice is when the file is done being modified and closed and can be safely made read-only with lamination. 
-By assuming that processes do not need to access file data modified +For applications in which processes do not need to access file data modified by other processes before lamination, UnifyFS can optimize write performance by buffering all metadata and file data for processes locally, instead of performing costly exchanges of From c33f93c2d8442e48d676c54050ec4ac311027f14 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 10:31:35 -0700 Subject: [PATCH 24/81] global synchronization -> inter-process synchronization --- docs/assumptions.rst | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index d91d24c05..ee186cab0 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -77,17 +77,17 @@ of UnifyFS. The use of synchronization operations are required for applications that exhibit I/O accesses that deviate from the bulk synchronous I/O pattern. There are two types of synchronization that are required for correct execution -of parallel I/O on UnifyFS: *local synchronization* and *global synchronization*. +of parallel I/O on UnifyFS: *local synchronization* and *inter-process synchronization*. Here, *local synchronization* refers to synchronization operations performed locally by a process to ensure that its updates to a file are visible to other processes. For example, a process may update a region of a file and then execute fflush() so that a different process can read the updated file contents. -*Global synchronization* refers to synchronization +*Inter-process synchronization* refers to synchronization operations that are performed to enforce ordering of conflicting I/O operations from multiple processes. -These global synchronizations occur outside of normal file I/O operations and -typically involve interprocess communication, e.g., with MPI. For example, +These inter-process synchronizations occur outside of normal file I/O operations and +typically involve inter-process communication, e.g., with MPI. For example, if two processes need to update the same file region and it is important to the outcome of the program that the updates occur in a particular order, then the program needs to enforce this ordering with an operation like an MPI_Barrier() @@ -114,7 +114,7 @@ requirements of UnifyFS. or by supplying the client.write_sync configuration parameter to UnifyFS on startup, which will cause an implicit "flush" operation after every write (note: use of the client.write_sync mode can significantly slow down - write performance.). In this case, global synchronization is still required + write performance.). In this case, inter-process synchronization is still required for applications that perform conflicting updates to files. During a write phase, a process can deviate from the bulk synchronous @@ -142,7 +142,7 @@ In summary, reading the local data (which has been written by processes executing on the same compute node) will always be faster than reading remote data. -Note that, as we discuss above, commit semantics also require global synchronization +Note that, as we discuss above, commit semantics also require inter-process synchronization for potentially conflicting write accesses. If an application does not enforce sequential ordering of file modifications during a write phase, e.g., with MPI synchronization, @@ -164,9 +164,6 @@ laminated, it becomes permanently read-only and its data is accessible across all the compute nodes in the job without further synchronization. 
Once a file is laminated, it cannot be further modified, except for being renamed or deleted. -.. Is the next sentence true? Does more need to be added? -If a failure occurs during a job before a file is laminated, the file -contents may be unrecoverable. A typical use case for lamination is for checkpoint/restart. An application can laminate checkpoint files after they have From ab5552391de33048b5decd2038afd863fd7a5bda Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 10:43:32 -0700 Subject: [PATCH 25/81] adding text describing the issue of overlapping writes --- docs/assumptions.rst | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index ee186cab0..cb8dab312 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -29,7 +29,10 @@ For example, files are written during checkpointing (a write phase) and read during recovery/restart (a read phase). Additionally, parallel writes and reads to shared files occur systematically, where processes access computable, regular offsets of files, e.g., in strided or -segmented access patterns, in contrast to random, interleaved, small writes and reads. +segmented access patterns, with ordering of potential conflicting updates +enforced by inter-process communication. +This behavior is in contrast to other I/O patterns that may perform +random, small writes and reads or overlapping writes without synchronization. UnifyFS offers the best performance for applications that exhibit the bulk synchronous I/O pattern. While UnifyFS does support deviations from this pattern, @@ -148,8 +151,7 @@ write accesses. If an application does not enforce sequential ordering of file modifications during a write phase, e.g., with MPI synchronization, and multiple processes write concurrently to the same file offset or to an overlapping region, the result is undefined and may -reflect the result of any of the processes' operations to that offset or region. -**I don't think this paragraph is true. I think we won't return the last write even if synchronization is applied** +reflect the result of a mixture of the processes' operations to that offset or region. .. How can users check that their application is correctly synchronized? Will we have the checker scripts ready? @@ -230,7 +232,8 @@ Behavior before lamination (write phase): - open/close: A process may open/close a file multiple times. - write: A process may write to any part of a file. If two processes write - to the same location, the value is undefined. + to the same location concurrently (i.e., without inter-process + synchronization to enforce ordering), the result is undefined. - read: A process may read bytes it has written. Reading other bytes is invalid without explicit synchronization operations. From 36fa254f7a11d4b42ae0c4e5ace8baf7275c3b6c Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 10:44:57 -0700 Subject: [PATCH 26/81] typo --- docs/assumptions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index cb8dab312..1f5a467d6 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -117,7 +117,7 @@ requirements of UnifyFS. or by supplying the client.write_sync configuration parameter to UnifyFS on startup, which will cause an implicit "flush" operation after every write (note: use of the client.write_sync mode can significantly slow down - write performance.). 
In this case, inter-process synchronization is still required + write performance). In this case, inter-process synchronization is still required for applications that perform conflicting updates to files. During a write phase, a process can deviate from the bulk synchronous From 71cbe42504d23dfaedbb349f009ec01232433ba5 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 11:05:06 -0700 Subject: [PATCH 27/81] fixing/adding links --- docs/assumptions.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 1f5a467d6..e1e6795c1 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -48,9 +48,9 @@ Consistency Model --------------------------- The UnifyFS file system does not support strict POSIX consistency semantics. -(Please see Chen et al., HPDC 2021 for more details -on different file system consistency semantics models.) -.. paper link needed when proceedings are out +(Please see +`Chen et al., HPDC 2021 `_ +for more details on different file system consistency semantics models.) Instead, UnifyFS supports two different consistency models: *commit consistency semantics* when a file is actively being modified; and *lamination semantics* when the file is no longer being @@ -267,10 +267,10 @@ The additional behavior of UnifyFS can be summarized as follows. options. - UnifyFS also can be coupled with SymphonyFS_, high level I/O libraries, or - a checkpoint library like (SCR_) or (VeloC_) to move data to the PFS periodically. + a checkpoint library like SCR_ or VeloC_ to move data to the PFS periodically. - - UnifyFS can be used with checkpointing libraries like (SCR_) or (VeloC_), - or with I/O libraries like (HDF5_) to support shared files on burst buffers. + - UnifyFS can be used with checkpointing libraries like SCR_ or VeloC_, + or with I/O libraries like HDF5_ to support shared files on burst buffers. - The UnifyFS file system will be empty at job start. A user job must populate the file system manually or by using From a22c5463f522f26093ebcc6c0e45c7a3d7791eb9 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Sat, 26 Jun 2021 11:06:21 -0700 Subject: [PATCH 28/81] fix another link --- docs/assumptions.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/assumptions.rst b/docs/assumptions.rst index e1e6795c1..10eea9f76 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -224,7 +224,7 @@ Failure behavior: - The UnifyFS team plans to improve the reliability of UnifyFS in the event of failures using redundancy scheme implementations available from - the (VeloC_) project as part of a future release. + the VeloC_ project as part of a future release. Behavior before lamination (write phase): From 1512a9f7679d6dcfbe645b267a839d528e95e12f Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Thu, 15 Jul 2021 15:44:10 -0400 Subject: [PATCH 29/81] cleaner separation of posix client code and library api client changes: * remove unused app_id parameter to unifyfs_mount() * Makefile updates to clearly indicate source files corresponding to core functionality, library API, and POSIX I/O support * adds unifyfs_client state structure to avoid use of global variables * move implementation of client API functions to client_api.c * move wrapper macros to unifyfs_wrap.h * introduces posix_client.[ch] to manage global state for POSIX I/O * add unifyfs_fid.[ch] to contain all 'unifyfs_fid_xxx' functions * PMPI wrapper for unifyfs_mount() now checks for and uses environment variable setting for UNIFYFS_MOUNTPOINT * library API transfer support now uses server-based transfers * adds checks in library API to ensure created/accessed files are within client namespace common changes: * arraylist_add() now returns position of appended item * remove MDHIM-specific configuration settings * remove unused constants from unifyfs_const.h server changes: * implements support for parallel transfers where each server with local file data copies it to the destination file tests/examples changes: * use updated client and library APIs * new library API transfer test * add library API support to examples testutil * new write-transfer example * fix sharness.sh test_dir_is_empty() to ignore trailing slashes TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" TEST_CHECKPATCH_SKIP_FILES+=",client/src/unifyfs_wrap.h" TEST_CHECKPATCH_SKIP_FILES+=",examples/src/testutil.h" --- Makefile.am | 2 +- client/src/Makefile.am | 44 +- client/src/client_api.c | 93 + client/src/client_read.c | 94 +- client/src/client_read.h | 48 +- client/src/client_transfer.c | 512 +++-- client/src/client_transfer.h | 57 +- client/src/margo_client.c | 296 ++- client/src/margo_client.h | 54 +- client/src/pmpi_wrappers.c | 14 +- client/src/posix_client.c | 917 ++++++++ client/src/posix_client.h | 206 ++ client/src/unifyfs-dirops.c | 28 +- client/src/unifyfs-dirops.h | 7 +- client/src/unifyfs-fixed.c | 332 --- client/src/unifyfs-fixed.h | 67 - client/src/unifyfs-internal.h | 461 ---- client/src/unifyfs-stdio.c | 67 +- client/src/unifyfs-stdio.h | 1 + client/src/unifyfs-sysio.c | 267 ++- client/src/unifyfs-sysio.h | 5 +- client/src/unifyfs.c | 2086 +++---------------- client/src/unifyfs.h | 44 +- client/src/unifyfs_api.c | 169 +- client/src/unifyfs_api.h | 2 + client/src/unifyfs_api_file.c | 97 +- client/src/unifyfs_api_internal.h | 109 +- client/src/unifyfs_api_io.c | 33 +- client/src/unifyfs_api_transfer.c | 93 +- client/src/unifyfs_fid.c | 1145 ++++++++++ client/src/unifyfs_fid.h | 152 ++ client/src/unifyfs_wrap.h | 115 + client/src/unifyfsf.c | 8 +- common/src/Makefile.mk | 1 + common/src/arraylist.c | 34 +- common/src/unifyfs_client.h | 50 + common/src/unifyfs_client_rpcs.h | 34 + common/src/unifyfs_configurator.h | 7 +- common/src/unifyfs_const.h | 59 +- common/src/unifyfs_logio.c | 8 +- common/src/unifyfs_meta.c | 2 +- common/src/unifyfs_meta.h | 10 +- common/src/unifyfs_rc.h | 2 +- common/src/unifyfs_server_rpcs.h | 38 +- docs/api.rst | 5 +- docs/library_api.rst | 3 +- examples/src/Makefile.am | 15 +- examples/src/app-btio.c | 7 +- examples/src/app-hdf5-create.c | 2 +- examples/src/app-hdf5-writeread.c | 2 +- examples/src/app-mpiio.c | 2 +- examples/src/app-tileio.c | 7 +- examples/src/read-data.c | 2 +- examples/src/simul.c | 2 +- examples/src/testutil.c | 10 +- examples/src/testutil.h | 239 ++- 
examples/src/testutil_rdwr.h | 236 ++- examples/src/transfer.c | 87 +- examples/src/write-transfer.c | 444 ++++ examples/src/writeread.c | 3 + server/src/Makefile.am | 2 + server/src/extent_tree.h | 6 + server/src/margo_server.c | 82 +- server/src/margo_server.h | 15 +- server/src/unifyfs_client_rpc.c | 64 +- server/src/unifyfs_fops.h | 122 +- server/src/unifyfs_fops_mdhim.c | 23 +- server/src/unifyfs_fops_rpc.c | 87 +- server/src/unifyfs_global.h | 13 +- server/src/unifyfs_group_rpc.c | 150 +- server/src/unifyfs_group_rpc.h | 14 + server/src/unifyfs_inode.c | 2 +- server/src/unifyfs_metadata_mdhim.c | 12 +- server/src/unifyfs_metadata_mdhim.h | 13 + server/src/unifyfs_p2p_rpc.c | 117 ++ server/src/unifyfs_p2p_rpc.h | 19 + server/src/unifyfs_request_manager.c | 61 +- server/src/unifyfs_request_manager.h | 5 +- server/src/unifyfs_server.c | 61 +- server/src/unifyfs_server_pid.c | 2 +- server/src/unifyfs_service_manager.c | 326 ++- server/src/unifyfs_service_manager.h | 25 +- server/src/unifyfs_transfer.c | 313 +++ server/src/unifyfs_transfer.h | 65 + t/0700-unifyfs-stage-full.t | 4 +- t/{8000-client-api.t => 8000-library-api.t} | 2 +- t/9020-mountpoint-empty.t | 2 +- t/9300-unifyfs-stage-isolated.t | 4 +- t/9999-cleanup.t | 2 +- t/Makefile.am | 20 +- t/api/{client_api_suite.c => api_suite.c} | 10 +- t/api/{client_api_suite.h => api_suite.h} | 17 +- t/api/create-open-remove.c | 13 +- t/api/init-fini.c | 2 +- t/api/laminate.c | 2 +- t/api/transfer.c | 302 +++ t/api/write-read-sync-stat.c | 2 +- t/lib/testutil.c | 35 +- t/lib/testutil.h | 12 +- t/sharness.d/02-functions.sh | 15 +- t/sharness.sh | 2 +- t/std/stdio_suite.c | 2 +- t/sys/statfs_suite.c | 2 +- t/sys/sysio_suite.c | 2 +- t/unifyfs_unmount.c | 2 +- util/unifyfs-stage/src/unifyfs-stage.c | 2 +- util/unifyfs/src/unifyfs-rm.c | 4 +- 107 files changed, 7054 insertions(+), 3941 deletions(-) create mode 100644 client/src/client_api.c create mode 100644 client/src/posix_client.c create mode 100644 client/src/posix_client.h delete mode 100644 client/src/unifyfs-fixed.c delete mode 100644 client/src/unifyfs-fixed.h create mode 100644 client/src/unifyfs_fid.c create mode 100644 client/src/unifyfs_fid.h create mode 100644 client/src/unifyfs_wrap.h create mode 100644 common/src/unifyfs_client.h create mode 100644 examples/src/write-transfer.c create mode 100644 server/src/unifyfs_transfer.c create mode 100644 server/src/unifyfs_transfer.h rename t/{8000-client-api.t => 8000-library-api.t} (77%) rename t/api/{client_api_suite.c => api_suite.c} (87%) rename t/api/{client_api_suite.h => api_suite.h} (79%) create mode 100644 t/api/transfer.c diff --git a/Makefile.am b/Makefile.am index 56b15a293..f8cf18283 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = meta server client examples extras t util +SUBDIRS = extras meta client server util examples t CONFIG = ordered diff --git a/client/src/Makefile.am b/client/src/Makefile.am index a40b2b180..fa46fdf7f 100644 --- a/client/src/Makefile.am +++ b/client/src/Makefile.am @@ -56,7 +56,7 @@ CLIENT_COMMON_CFLAGS += $(SPATH_CFLAGS) CLIENT_COMMON_LIBADD += $(SPATH_LIBS) endif -CLIENT_API_SRC_FILES = \ +LIBRARY_API_SRC_FILES = \ unifyfs_api.h \ unifyfs_api_internal.h \ unifyfs_api.c \ @@ -65,8 +65,8 @@ CLIENT_API_SRC_FILES = \ unifyfs_api_transfer.c CLIENT_CORE_SRC_FILES = \ - $(OPT_SRCS) \ $(UNIFYFS_COMMON_SRCS) \ + client_api.c \ client_read.c \ client_read.h \ client_transfer.c \ @@ -75,19 +75,22 @@ CLIENT_CORE_SRC_FILES = \ margo_client.h \ unifyfs.c \ unifyfs.h \ - unifyfs-fixed.c \ - 
unifyfs-fixed.h \ + unifyfs_fid.c \ + unifyfs_fid.h \ unifyfs-internal.h \ uthash.h \ utlist.h -CLIENT_POSIX_SRC_FILES = \ +POSIX_CLIENT_SRC_FILES = \ + posix_client.c \ + posix_client.h \ unifyfs-dirops.c \ unifyfs-dirops.h \ unifyfs-stdio.c \ unifyfs-stdio.h \ unifyfs-sysio.c \ - unifyfs-sysio.h + unifyfs-sysio.h \ + unifyfs_wrap.h PMPI_SRC_FILES = \ pmpi_wrappers.c \ @@ -96,56 +99,57 @@ PMPI_SRC_FILES = \ # Per-target flags begin here -libunifyfs_api_la_SOURCES = \ - $(CLIENT_API_SRC_FILES) \ - $(CLIENT_CORE_SRC_FILES) \ - $(CLIENT_POSIX_SRC_FILES) libunifyfs_api_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfs_api_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_api_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_api_la_LIBADD = $(CLIENT_COMMON_LIBADD) - -libunifyfs_la_SOURCES = \ +libunifyfs_api_la_SOURCES = \ $(CLIENT_CORE_SRC_FILES) \ - $(CLIENT_POSIX_SRC_FILES) + $(LIBRARY_API_SRC_FILES) + libunifyfs_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfs_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_la_LIBADD = $(CLIENT_COMMON_LIBADD) +libunifyfs_la_SOURCES = \ + $(CLIENT_CORE_SRC_FILES) \ + $(LIBRARY_API_SRC_FILES) \ + $(POSIX_CLIENT_SRC_FILES) if USE_PMPI_WRAPPERS -libunifyfs_mpi_la_SOURCES = $(PMPI_SRC_FILES) libunifyfs_mpi_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfs_mpi_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_mpi_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_mpi_la_LIBADD = libunifyfs.la +libunifyfs_mpi_la_SOURCES = $(PMPI_SRC_FILES) endif #USE_PMPI_WRAPPERS if HAVE_GOTCHA -libunifyfs_gotcha_la_SOURCES = \ - $(CLIENT_CORE_SRC_FILES) \ - $(CLIENT_POSIX_SRC_FILES) \ - gotcha_map_unifyfs_list.c libunifyfs_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) -DUNIFYFS_GOTCHA libunifyfs_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) $(GOTCHA_CFLAGS) libunifyfs_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_gotcha_la_LIBADD = $(CLIENT_COMMON_LIBADD) $(GOTCHA_LIBS) +libunifyfs_gotcha_la_SOURCES = \ + $(CLIENT_CORE_SRC_FILES) \ + $(LIBRARY_API_SRC_FILES) \ + $(POSIX_CLIENT_SRC_FILES) \ + gotcha_map_unifyfs_list.c if USE_PMPI_WRAPPERS -libunifyfs_mpi_gotcha_la_SOURCES = $(PMPI_SRC_FILES) libunifyfs_mpi_gotcha_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfs_mpi_gotcha_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfs_mpi_gotcha_la_LDFLAGS = $(CLIENT_COMMON_LDFLAGS) libunifyfs_mpi_gotcha_la_LIBADD = libunifyfs_gotcha.la +libunifyfs_mpi_gotcha_la_SOURCES = $(PMPI_SRC_FILES) endif #USE_PMPI_WRAPPERS if HAVE_FORTRAN -libunifyfsf_la_SOURCES = unifyfsf.c libunifyfsf_la_CPPFLAGS = $(CLIENT_COMMON_CPPFLAGS) libunifyfsf_la_CFLAGS = $(CLIENT_COMMON_CFLAGS) libunifyfsf_la_LIBADD = libunifyfs_gotcha.la +libunifyfsf_la_SOURCES = unifyfsf.c endif #HAVE_FORTRAN endif #HAVE_GOTCHA diff --git a/client/src/client_api.c b/client/src/client_api.c new file mode 100644 index 000000000..ecc75b0e3 --- /dev/null +++ b/client/src/client_api.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "unifyfs.h" +#include "unifyfs_api_internal.h" +#include "posix_client.h" + + +/* ======================================= + * Global variable declarations + * ======================================= */ + +char* unifyfs_mount_prefix; +size_t unifyfs_mount_prefixlen; + + +/* ======================================= + * Operations to mount/unmount file system + * ======================================= */ + +/** + * Mount UnifyFS file system at given prefix + * + * @param prefix: directory prefix + * @param rank: client rank within application + * @param size: the number of application clients + * + * @return success/error code + */ +int unifyfs_mount(const char prefix[], + int rank, + size_t size) +{ + // generate app_id from mountpoint prefix + int app_id = unifyfs_generate_gfid(prefix); + if (-1 != unifyfs_mount_id) { + if (app_id != unifyfs_mount_id) { + LOGERR("multiple mount support not yet implemented"); + return UNIFYFS_FAILURE; + } else { + LOGDBG("already mounted"); + return UNIFYFS_SUCCESS; + } + } + + // record our rank for debugging messages + client_rank = rank; + global_rank_cnt = (int)size; + + // record mountpoint prefix string + unifyfs_mount_prefix = strdup(prefix); + unifyfs_mount_prefixlen = strlen(prefix); + + // initialize as UnifyFS POSIX client + return posix_client_init(); +} + +/** + * Unmount the mounted UnifyFS file system + * + * @return success/error code + */ +int unifyfs_unmount(void) +{ + if (-1 == unifyfs_mount_id) { + return UNIFYFS_SUCCESS; + } + + int rc = posix_client_fini(); + if (UNIFYFS_SUCCESS != rc) { + return rc; + } + + /* free mount prefix string */ + if (unifyfs_mount_prefix != NULL) { + free(unifyfs_mount_prefix); + unifyfs_mount_prefix = NULL; + unifyfs_mount_prefixlen = 0; + } + + return UNIFYFS_SUCCESS; +} diff --git a/client/src/client_read.c b/client/src/client_read.c index 476ba9fd0..7824b2d55 100644 --- a/client/src/client_read.c +++ b/client/src/client_read.c @@ -26,33 +26,32 @@ static void debug_print_read_req(read_req_t* req) } } -/* an arraylist to maintain the active mread requests for the client */ -arraylist_t* active_mreads; // = NULL -/* use to generated unique ids for each new mread */ -static unsigned int id_generator; // = 0 /* compute the arraylist index for the given request id. 
we use * modulo operator to reuse slots in the list */ static inline -unsigned int id_to_list_index(unsigned int id) +unsigned int id_to_list_index(unifyfs_client* client, + unsigned int id) { - unsigned int capacity = (unsigned int) arraylist_capacity(active_mreads); + unsigned int capacity = (unsigned int) + arraylist_capacity(client->active_mreads); return id % capacity; } /* Create a new mread request containing the n_reads requests provided * in read_reqs array */ -client_mread_status* client_create_mread_request(int n_reads, +client_mread_status* client_create_mread_request(unifyfs_client* client, + int n_reads, read_req_t* read_reqs) { - if (NULL == active_mreads) { - LOGERR("active_mreads is NULL"); + if ((NULL == client) || (NULL == client->active_mreads)) { + LOGERR("client->active_mreads is NULL"); return NULL; } - int active_count = arraylist_size(active_mreads); - if (active_count == arraylist_capacity(active_mreads)) { + int active_count = arraylist_size(client->active_mreads); + if (active_count == arraylist_capacity(client->active_mreads)) { /* already at full capacity for outstanding reads */ LOGWARN("too many outstanding client reads"); return NULL; @@ -62,9 +61,9 @@ client_mread_status* client_create_mread_request(int n_reads, unsigned int mread_id, req_ndx; void* existing; do { - mread_id = id_generator++; - req_ndx = id_to_list_index(mread_id); - existing = arraylist_get(active_mreads, req_ndx); + mread_id = client->mread_id_generator++; + req_ndx = id_to_list_index(client, mread_id); + existing = arraylist_get(client->active_mreads, req_ndx); } while (existing != NULL); client_mread_status* mread = calloc(1, sizeof(client_mread_status)); @@ -72,55 +71,59 @@ client_mread_status* client_create_mread_request(int n_reads, LOGERR("failed to allocate client mread status"); return NULL; } + mread->client = client; + mread->id = mread_id; + mread->reqs = read_reqs; + mread->n_reads = (unsigned int) n_reads; + ABT_mutex_create(&(mread->sync)); - int rc = arraylist_insert(active_mreads, (int)req_ndx, (void*)mread); + int rc = arraylist_insert(client->active_mreads, + (int)req_ndx, (void*)mread); if (rc != 0) { free(mread); return NULL; } - mread->id = mread_id; - mread->reqs = read_reqs; - mread->n_reads = (unsigned int) n_reads; - ABT_mutex_create(&(mread->sync)); - return mread; } /* Remove the mread status */ int client_remove_mread_request(client_mread_status* mread) { - if (NULL == active_mreads) { - LOGERR("active_mreads is NULL"); - return EINVAL; - } if (NULL == mread) { LOGERR("mread is NULL"); return EINVAL; } - int list_index = (int) id_to_list_index(mread->id); - void* list_item = arraylist_remove(active_mreads, list_index); + unifyfs_client* client = mread->client; + if (NULL == client->active_mreads) { + LOGERR("client->active_mreads is NULL"); + return EINVAL; + } + + int list_index = (int) id_to_list_index(client, mread->id); + void* list_item = arraylist_remove(client->active_mreads, list_index); if (list_item == (void*)mread) { ABT_mutex_free(&(mread->sync)); free(mread); return UNIFYFS_SUCCESS; } else { - LOGERR("mismatch on active_mreads index=%d", list_index); + LOGERR("mismatch on client->active_mreads index=%d", list_index); return UNIFYFS_FAILURE; } } /* Retrieve the mread request corresponding to the given mread_id. 
*/ -client_mread_status* client_get_mread_status(unsigned int mread_id) +client_mread_status* client_get_mread_status(unifyfs_client* client, + unsigned int mread_id) { - if (NULL == active_mreads) { - LOGERR("active_mreads is NULL"); + if ((NULL == client) || (NULL == client->active_mreads)) { + LOGERR("client->active_mreads is NULL"); return NULL; } - int list_index = (int) id_to_list_index(mread_id); - void* list_item = arraylist_get(active_mreads, list_index); + int list_index = (int) id_to_list_index(client, mread_id); + void* list_item = arraylist_get(client->active_mreads, list_index); client_mread_status* status = (client_mread_status*)list_item; if (NULL != status) { if (status->id != mread_id) { @@ -277,6 +280,7 @@ void update_read_req_coverage(read_req_t* req, * requests to be handled by the server. */ static void service_local_reqs( + unifyfs_client* client, read_req_t* read_reqs, /* list of input read requests */ int count, /* number of input read requests */ read_req_t* local_reqs, /* output list of requests completed by client */ @@ -298,7 +302,7 @@ void service_local_reqs( int gfid = req->gfid; /* lookup local extents if we have them */ - int fid = unifyfs_fid_from_gfid(gfid); + int fid = unifyfs_fid_from_gfid(client, gfid); /* move to next request if we can't find the matching fid */ if (fid < 0) { @@ -314,7 +318,7 @@ void service_local_reqs( size_t req_end = req->offset + req->length; /* get pointer to extents for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); assert(meta != NULL); struct seg_tree* extents = &meta->extents; @@ -398,8 +402,8 @@ void service_local_reqs( /* copy data from local write log into user buffer */ off_t log_offset = ext_log_pos + ext_byte_offset; size_t nread = 0; - int rc = unifyfs_logio_read(logio_ctx, log_offset, cover_length, - req_ptr, &nread); + int rc = unifyfs_logio_read(client->state.logio_ctx, log_offset, + cover_length, req_ptr, &nread); if (rc == UNIFYFS_SUCCESS) { /* update bytes we have filled in the request buffer */ update_read_req_coverage(req, req_byte_offset, nread); @@ -460,7 +464,9 @@ int compare_read_req(const void* a, const void* b) * * @return error code */ -int process_gfid_reads(read_req_t* in_reqs, int in_count) +int process_gfid_reads(unifyfs_client* client, + read_req_t* in_reqs, + int in_count) { if (0 == in_count) { return UNIFYFS_SUCCESS; @@ -498,7 +504,7 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) read_req_t* reqs = NULL; /* attempt to complete requests locally if enabled */ - if (unifyfs_local_extents) { + if (client->use_local_extents) { /* allocate space to make local and server copies of the requests, * each list will be at most in_count long */ size_t reqs_size = 2 * in_count; @@ -517,7 +523,7 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) * completed requests from in_reqs into local_reqs, and it copies * any requests that can't be completed locally into the server_reqs * to be processed by the server */ - service_local_reqs(in_reqs, in_count, + service_local_reqs(client, in_reqs, in_count, local_reqs, server_reqs, &server_count); /* return early if we satisfied all requests locally */ @@ -546,7 +552,8 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) qsort(server_reqs, server_count, sizeof(read_req_t), compare_read_req); /* create mread status for tracking completion */ - client_mread_status* mread = client_create_mread_request(server_count, + client_mread_status* mread = 
client_create_mread_request(client, + server_count, server_reqs); if (NULL == mread) { return ENOMEM; @@ -572,7 +579,8 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) mread_id, server_count, server_reqs); /* invoke multi-read rpc on server */ - read_rc = invoke_client_mread_rpc(mread_id, server_count, size, buffer); + read_rc = invoke_client_mread_rpc(client, mread_id, server_count, + size, buffer); free(buffer); if (read_rc != UNIFYFS_SUCCESS) { @@ -654,7 +662,7 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) } /* get file size for this file */ - off_t filesize_offt = unifyfs_gfid_filesize(req->gfid); + off_t filesize_offt = unifyfs_gfid_filesize(client, req->gfid); if (filesize_offt == (off_t)-1) { /* failed to get file size */ req->errcode = ENOENT; @@ -704,7 +712,7 @@ int process_gfid_reads(read_req_t* in_reqs, int in_count) /* if we attempted to service requests from our local extent map, * then we need to copy the resulting read requests from the local * and server arrays back into the user's original array */ - if (unifyfs_local_extents) { + if (client->use_local_extents) { /* TODO: would be nice to copy these back into the same order * in which we received them. */ diff --git a/client/src/client_read.h b/client/src/client_read.h index f7202f802..517ca09f9 100644 --- a/client/src/client_read.h +++ b/client/src/client_read.h @@ -16,16 +16,51 @@ #define _UNIFYFS_CLIENT_READ_H #include "unifyfs-internal.h" +#include "unifyfs_api_internal.h" +#include "unifyfs_fid.h" // headers for client-server RPCs #include "unifyfs_client_rpcs.h" #include "margo_client.h" +/* This structure defines a client read request for one file corresponding to + * the global file id (gfid). It describes a contiguous read extent starting + * at offset with given length. */ +typedef struct { + /* The read request parameters */ + int gfid; /* global id of file to be read */ + size_t offset; /* logical file offset */ + size_t length; /* requested number of bytes */ + char* buf; /* user buffer to place data */ + struct aiocb* aiocbp; /* user aiocb* from aio or listio */ + + /* These two variables define the byte offset range of the extent for + * which we filled valid data. + * If cover_begin_offset != 0, there is a gap at the beginning + * of the read extent that should be zero-filled. + * If cover_end_offset != (length - 1), it was a short read. */ + volatile size_t cover_begin_offset; + volatile size_t cover_end_offset; + + /* nread is the user-visible number of bytes read. Since this includes + * any gaps, nread should be set to (cover_end_offset + 1) when the + * read request has been fully serviced. */ + size_t nread; + + /* errcode holds any error code encountered during the read. + * The error may be an internal error value (unifyfs_rc_e) or a + * normal POSIX error code. It will be converted to a valid errno value + * for use in returning from the syscall. */ + int errcode; +} read_req_t; + /* Structure used by the client to track completion state for a * set of read requests submitted by a single client syscall. * The server will return data for each read request in separate * rpc calls. 
*/ typedef struct { + unifyfs_client* client; + unsigned int id; /* unique id for this set of read requests */ unsigned int n_reads; /* number of read requests */ read_req_t* reqs; /* array of read requests */ @@ -36,19 +71,18 @@ typedef struct { volatile unsigned int n_error; /* number of requests that had errors */ } client_mread_status; -/* an arraylist to maintain the active mread requests for the client */ -extern arraylist_t* active_mreads; - /* Create a new mread request containing the n_reads requests provided * in read_reqs array */ -client_mread_status* client_create_mread_request(int n_reads, +client_mread_status* client_create_mread_request(unifyfs_client* client, + int n_reads, read_req_t* read_reqs); /* Remove the mread status */ int client_remove_mread_request(client_mread_status* mread); /* Retrieve the mread request corresponding to the given request_id */ -client_mread_status* client_get_mread_status(unsigned int request_id); +client_mread_status* client_get_mread_status(unifyfs_client* client, + unsigned int request_id); /* Update the mread status for the request at the given req_index. * If the request is now complete, update the request's completion state @@ -75,6 +109,8 @@ void update_read_req_coverage(read_req_t* req, size_t extent_length); /* process a set of client read requests */ -int process_gfid_reads(read_req_t* in_reqs, int in_count); +int process_gfid_reads(unifyfs_client* client, + read_req_t* in_reqs, + int in_count); #endif // UNIFYFS_CLIENT_READ_H diff --git a/client/src/client_transfer.c b/client/src/client_transfer.c index b8a6b3072..db5c1cfb4 100644 --- a/client/src/client_transfer.c +++ b/client/src/client_transfer.c @@ -13,330 +13,298 @@ */ #include "client_transfer.h" -#include "unifyfs-sysio.h" -static -int do_transfer_data(int fd_src, - int fd_dst, - off_t offset, - size_t count) -{ - int ret = UNIFYFS_SUCCESS; - int err; - off_t pos = 0; - ssize_t n_written = 0; - ssize_t n_read = 0; - ssize_t n_processed = 0; - size_t len = UNIFYFS_TX_BUFSIZE; - char* buf = NULL; - - buf = malloc(UNIFYFS_TX_BUFSIZE); - if (NULL == buf) { - LOGERR("failed to allocate transfer buffer"); - return ENOMEM; - } - errno = 0; - pos = UNIFYFS_WRAP(lseek)(fd_src, offset, SEEK_SET); - err = errno; - if (pos == (off_t) -1) { - LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); - ret = err; - goto out; +static const char* invalid_str = "INVALID"; + +static const char* mode_copy_str = "COPY"; +static const char* mode_move_str = "MOVE"; +static const char* transfer_mode_str(unifyfs_transfer_mode mode) +{ + switch (mode) { + case UNIFYFS_TRANSFER_MODE_COPY: + return mode_copy_str; + case UNIFYFS_TRANSFER_MODE_MOVE: + return mode_move_str; + default: + return invalid_str; } + return NULL; +} - errno = 0; - pos = UNIFYFS_WRAP(lseek)(fd_dst, offset, SEEK_SET); - err = errno; - if (pos == (off_t) -1) { - LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); - ret = err; - goto out; +static const char* state_canceled_str = "CANCELED"; +static const char* state_completed_str = "COMPLETED"; +static const char* state_inprogress_str = "IN-PROGRESS"; +static const char* transfer_state_str(unifyfs_ioreq_state state) +{ + switch (state) { + case UNIFYFS_IOREQ_STATE_IN_PROGRESS: + return state_inprogress_str; + case UNIFYFS_IOREQ_STATE_CANCELED: + return state_canceled_str; + case UNIFYFS_IOREQ_STATE_COMPLETED: + return state_completed_str; + default: + return invalid_str; } + return NULL; +} - while (count > n_processed) { - if (len > count) { - len = count; - } +#define 
debug_print_transfer_req(req) \ +do { \ + if (NULL != (req)) { \ + LOGDBG("transfer_req[%p] src=%s, dst=%s, mode=%s, parallel=%d" \ + " - id=%d, state=%s, result=%d, errcode=%d (%s)", \ + (req), (req)->src_path, (req)->dst_path, \ + transfer_mode_str((req)->mode), (req)->use_parallel, \ + (req)->_reqid, transfer_state_str((req)->state), \ + (req)->result.rc, (req)->result.error, \ + unifyfs_rc_enum_description((req)->result.error)); \ + } \ +} while (0) + +/* compute the arraylist index for the given request id. we use + * modulo operator to reuse slots in the list */ +static inline +unsigned int id_to_list_index(unifyfs_client* client, + unsigned int id) +{ + unsigned int capacity = (unsigned int) + arraylist_capacity(client->active_transfers); + return id % capacity; +} - errno = 0; - n_read = UNIFYFS_WRAP(read)(fd_src, buf, len); - err = errno; - if (n_read == 0) { /* EOF */ - break; - } else if (n_read < 0) { /* error */ - ret = err; - goto out; - } +/* Create a new transfer status for the given transfer request */ +int client_create_transfer(unifyfs_client* client, + unifyfs_transfer_request* req, + bool src_in_unify) +{ + if ((NULL == client) || (NULL == client->active_transfers)) { + LOGERR("client->active_transfers is NULL"); + return UNIFYFS_FAILURE; + } - do { - errno = 0; - n_written = UNIFYFS_WRAP(write)(fd_dst, buf, n_read); - err = errno; - if (n_written < 0) { - ret = err; - goto out; - } else if ((n_written == 0) && err && (err != EAGAIN)) { - ret = err; - goto out; - } + if (NULL == req) { + LOGERR("transfer req is NULL"); + return EINVAL; + } - n_read -= n_written; - n_processed += n_written; - } while (n_read > 0); + int active_count = arraylist_size(client->active_transfers); + if (active_count == arraylist_capacity(client->active_transfers)) { + /* already at full capacity for outstanding reads */ + LOGWARN("too many outstanding client transfers"); + return UNIFYFS_FAILURE; } -out: - if (NULL != buf) { - free(buf); + /* generate an id that doesn't conflict with another active mread */ + unsigned int transfer_id, req_ndx; + void* existing; + do { + transfer_id = client->transfer_id_generator++; + req_ndx = id_to_list_index(client, transfer_id); + existing = arraylist_get(client->active_transfers, req_ndx); + } while (existing != NULL); + + client_transfer_status* transfer = calloc(1, sizeof(*transfer)); + if (NULL == transfer) { + LOGERR("failed to allocate transfer status struct"); + return ENOMEM; } + transfer->client = client; + transfer->req = req; + transfer->complete = 0; + transfer->src_in_unify = src_in_unify; + ABT_mutex_create(&(transfer->sync)); + + int rc = arraylist_insert(client->active_transfers, + (int)req_ndx, (void*)transfer); + if (rc != 0) { + free(transfer); + return rc; + } + req->_reqid = transfer_id; + debug_print_transfer_req(req); - return ret; + return UNIFYFS_SUCCESS; } -int do_transfer_file_serial(const char* src, - const char* dst, - struct stat* sb_src, - int direction) +/* Remove the transfer status */ +client_transfer_status* client_get_transfer(unifyfs_client* client, + unsigned int transfer_id) { - /* NOTE: we currently do not use the @direction */ - - int err; - int ret = UNIFYFS_SUCCESS; - int fd_src = 0; - int fd_dst = 0; - - errno = 0; - fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); - err = errno; - if (fd_src < 0) { - LOGERR("failed to open() source file %s", src); - return err; + if ((NULL == client) || (NULL == client->active_transfers)) { + LOGERR("client->active_transfers is NULL"); + return NULL; } - errno = 0; - fd_dst = 
UNIFYFS_WRAP(open)(dst, O_WRONLY); - err = errno; - if (fd_dst < 0) { - LOGERR("failed to open() destination file %s", dst); - close(fd_src); - return err; + int list_index = (int) id_to_list_index(client, transfer_id); + void* list_item = arraylist_get(client->active_transfers, list_index); + if (list_item == NULL) { + LOGERR("client->active_transfers index=%d is NULL", list_index); + return NULL; } - LOGDBG("serial transfer (rank=%d of %d): length=%zu", - client_rank, global_rank_cnt, sb_src->st_size); + client_transfer_status* transfer = list_item; + return transfer; +} - ret = do_transfer_data(fd_src, fd_dst, 0, sb_src->st_size); - if (UNIFYFS_SUCCESS != ret) { - LOGERR("failed to transfer data (ret=%d, %s)", - ret, unifyfs_rc_enum_description(ret)); - } else { - UNIFYFS_WRAP(fsync)(fd_dst); +/* Check if the transfer has completed */ +bool client_check_transfer_complete(client_transfer_status* transfer) +{ + if ((NULL == transfer) || (NULL == transfer->req)) { + LOGERR("transfer is NULL"); + return false; } - UNIFYFS_WRAP(close)(fd_dst); - UNIFYFS_WRAP(close)(fd_src); + unifyfs_transfer_request* req = transfer->req; + //debug_print_transfer_req(req); + + bool is_complete = false; + + switch (req->state) { + case UNIFYFS_IOREQ_STATE_IN_PROGRESS: + ABT_mutex_lock(transfer->sync); + is_complete = (transfer->complete == 1); + ABT_mutex_unlock(transfer->sync); + break; + case UNIFYFS_IOREQ_STATE_CANCELED: + case UNIFYFS_IOREQ_STATE_COMPLETED: + is_complete = true; + break; + default: + break; + } - return ret; + return is_complete; } -int do_transfer_file_parallel(const char* src, - const char* dst, - struct stat* sb_src, - int direction) +/* Remove the transfer status */ +int client_cleanup_transfer(unifyfs_client* client, + client_transfer_status* transfer) { - /* NOTE: we currently do not use the @direction */ + if ((NULL == client) || (NULL == client->active_transfers)) { + LOGERR("client->active_transfers is NULL"); + return UNIFYFS_FAILURE; + } - int err; - int ret = UNIFYFS_SUCCESS; - int fd_src = 0; - int fd_dst = 0; - uint64_t total_chunks = 0; - uint64_t chunk_start = 0; - uint64_t n_chunks_remainder = 0; - uint64_t n_chunks_per_rank = 0; - uint64_t offset = 0; - uint64_t len = 0; - uint64_t size = sb_src->st_size; - uint64_t last_chunk_size = 0; - - /* calculate total number of chunk transfers */ - total_chunks = size / UNIFYFS_TX_BUFSIZE; - last_chunk_size = size % UNIFYFS_TX_BUFSIZE; - if (last_chunk_size) { - total_chunks++; + if ((NULL == transfer) || (NULL == transfer->req)) { + LOGERR("transfer status or request is NULL"); + return EINVAL; } - /* calculate chunks per rank */ - n_chunks_per_rank = total_chunks / global_rank_cnt; - n_chunks_remainder = total_chunks % global_rank_cnt; - - /* - * if the file is smaller than (rank_count * transfer_size), just - * use the serial mode. - * - * FIXME: is this assumption fair even for the large rank count? 
- */ - if (total_chunks <= (uint64_t)global_rank_cnt) { - if (client_rank == 0) { - LOGDBG("using serial transfer for small file"); - ret = do_transfer_file_serial(src, dst, sb_src, direction); - if (ret) { - LOGERR("do_transfer_file_serial() failed"); - } + unifyfs_transfer_request* req = transfer->req; + debug_print_transfer_req(req); + + if ((req->state == UNIFYFS_IOREQ_STATE_COMPLETED) && + (req->mode == UNIFYFS_TRANSFER_MODE_MOVE)) { + /* successful copy, now remove source */ + if (transfer->src_in_unify) { + unifyfs_remove(client, req->src_path); } else { - ret = UNIFYFS_SUCCESS; + LOGWARN("Not removing non-UnifyFS source file %s for " + "UNIFYFS_TRANSFER_MODE_MOVE", req->src_path); } - return ret; } - errno = 0; - fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); - err = errno; - if (fd_src < 0) { - LOGERR("failed to open() source file %s", src); - return err; + int list_index = (int) id_to_list_index(client, req->_reqid); + void* list_item = arraylist_remove(client->active_transfers, list_index); + if (list_item == (void*)transfer) { + ABT_mutex_free(&(transfer->sync)); + free(transfer); + return UNIFYFS_SUCCESS; + } else { + LOGERR("mismatch on client->active_transfers index=%d", list_index); + return UNIFYFS_FAILURE; } +} - errno = 0; - fd_dst = UNIFYFS_WRAP(open)(dst, O_WRONLY); - err = errno; - if (fd_dst < 0) { - LOGERR("failed to open() destination file %s", dst); - UNIFYFS_WRAP(close)(fd_src); - return err; +/* Update the transfer status for the client (app_id + client_id) + * transfer request (transfer_id) using the given error_code */ +int client_complete_transfer(unifyfs_client* client, + int transfer_id, + int error_code) +{ + if (NULL == client) { + LOGERR("NULL client"); + return EINVAL; } - chunk_start = n_chunks_per_rank * client_rank; - offset = chunk_start * UNIFYFS_TX_BUFSIZE; - len = n_chunks_per_rank * UNIFYFS_TX_BUFSIZE; - - LOGDBG("parallel transfer (rank=%d of %d): " - "#chunks=%zu, offset=%zu, length=%zu", - client_rank, global_rank_cnt, - (size_t)n_chunks_per_rank, (size_t)offset, (size_t)len); + client_transfer_status* transfer = client_get_transfer(client, + transfer_id); + if (NULL == transfer) { + LOGERR("failed to find client transfer with id=%d", transfer_id); + return EINVAL; + } - ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); - if (ret) { - LOGERR("failed to transfer data (ret=%d, %s)", - ret, unifyfs_rc_enum_description(ret)); - } else { - if (n_chunks_remainder && (client_rank < n_chunks_remainder)) { - /* do single chunk transfer per rank of remainder portion */ - len = UNIFYFS_TX_BUFSIZE; - if (last_chunk_size && (client_rank == (n_chunks_remainder - 1))) { - len = last_chunk_size; - } - chunk_start = (total_chunks - n_chunks_remainder) + client_rank; - offset = chunk_start * UNIFYFS_TX_BUFSIZE; - - LOGDBG("parallel transfer (rank=%d of %d): " - "#chunks=1, offset=%zu, length=%zu", - client_rank, global_rank_cnt, - (size_t)offset, (size_t)len); - ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); - if (ret) { - LOGERR("failed to transfer data (ret=%d, %s)", - ret, unifyfs_rc_enum_description(ret)); - } - } - fsync(fd_dst); + unifyfs_transfer_request* req = transfer->req; + if (NULL == req) { + LOGERR("found transfer status, but request is NULL - internal error"); + return UNIFYFS_FAILURE; } - UNIFYFS_WRAP(close)(fd_dst); - UNIFYFS_WRAP(close)(fd_src); + /* update the request status */ + ABT_mutex_lock(transfer->sync); + req->result.error = error_code; + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + 
transfer->complete = 1; + ABT_mutex_unlock(transfer->sync); - return ret; + return UNIFYFS_SUCCESS; } -int unifyfs_transfer_file(const char* src, - const char* dst, - int parallel) +int client_submit_transfers(unifyfs_client* client, + unifyfs_transfer_request* t_reqs, + size_t n_reqs) { - int rc, err; - int ret = 0; - int txdir = 0; - struct stat sb_src = { 0, }; - mode_t mode_no_write; - struct stat sb_dst = { 0, }; - int unify_src = 0; - int unify_dst = 0; - - char* src_path = strdup(src); - if (NULL == src_path) { - return -ENOMEM; - } - - char src_upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(src, src_upath)) { - txdir = UNIFYFS_TX_STAGE_OUT; - unify_src = 1; - } + int ret = UNIFYFS_SUCCESS; + int rc; - errno = 0; - rc = UNIFYFS_WRAP(stat)(src, &sb_src); - err = errno; - if (rc < 0) { - return -err; - } + for (size_t i = 0; i < n_reqs; i++) { + unifyfs_transfer_request* req = t_reqs + i; - char dst_path[UNIFYFS_MAX_FILENAME] = { 0, }; - char* pos = dst_path; - pos += sprintf(pos, "%s", dst); - - errno = 0; - rc = UNIFYFS_WRAP(stat)(dst, &sb_dst); - err = errno; - if (rc == 0 && S_ISDIR(sb_dst.st_mode)) { - /* if the given destination path is a directory, append the - * basename of the source file */ - sprintf(pos, "/%s", basename((char*) src_path)); - } + /* check for a valid transfer mode */ + switch (req->mode) { + case UNIFYFS_TRANSFER_MODE_COPY: + case UNIFYFS_TRANSFER_MODE_MOVE: + break; + default: + req->result.error = EINVAL; + req->result.rc = UNIFYFS_FAILURE; + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + continue; + } - char dst_upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(dst_path, dst_upath)) { - txdir = UNIFYFS_TX_STAGE_IN; - unify_dst = 1; - } + const char* src = req->src_path; + const char* dst = req->dst_path; + int parallel = req->use_parallel; - if (unify_src + unify_dst != 1) { - // we may fail the operation with EINVAL, but useful for testing - LOGDBG("WARNING: none of pathnames points to unifyfs volume"); - } + bool src_in_unify = is_unifyfs_path(client, src); + bool dst_in_unify = is_unifyfs_path(client, dst); - /* for both serial and parallel transfers, use rank 0 client to - * create the destination file using the source file's mode*/ - if (0 == client_rank) { - errno = 0; - int create_flags = O_CREAT | O_WRONLY | O_TRUNC; - int fd = UNIFYFS_WRAP(open)(dst_path, create_flags, sb_src.st_mode); - err = errno; - if (fd < 0) { - LOGERR("failed to create destination file %s", dst); - return -err; + /* either src or dst must be within client namespace, but not both */ + if ((!src_in_unify && !dst_in_unify) || + (src_in_unify && dst_in_unify)) { + rc = EINVAL; + } else { + req->state = UNIFYFS_IOREQ_STATE_IN_PROGRESS; + rc = client_create_transfer(client, req, src_in_unify); + if (UNIFYFS_SUCCESS == rc) { + if (src_in_unify) { + int gfid = unifyfs_generate_gfid(src); + rc = invoke_client_transfer_rpc(client, req->_reqid, + gfid, parallel, dst); + } else { + /* need to create dest file and copy all source data */ + rc = UNIFYFS_ERROR_NYI; + } + } } - close(fd); - } - if (parallel) { - rc = do_transfer_file_parallel(src_path, dst_path, &sb_src, txdir); - } else { - rc = do_transfer_file_serial(src_path, dst_path, &sb_src, txdir); - } - - if (rc != UNIFYFS_SUCCESS) { - ret = -unifyfs_rc_errno(rc); - } else { - ret = 0; - - /* If the destination file is in UnifyFS, then laminate it so that it - * will be readable by other clients. */ - if (unify_dst) { - /* remove the write bits from the source file's mode bits to set - * the new file mode. 
use chmod with the new mode to ask for file - * lamination. */ - mode_no_write = (sb_src.st_mode) & ~(0222); - UNIFYFS_WRAP(chmod)(dst_path, mode_no_write); + if (rc != UNIFYFS_SUCCESS) { + req->result.error = rc; + req->result.rc = UNIFYFS_FAILURE; + ret = UNIFYFS_FAILURE; + req->state = UNIFYFS_IOREQ_STATE_COMPLETED; } } diff --git a/client/src/client_transfer.h b/client/src/client_transfer.h index 09f5d0422..b3a76131d 100644 --- a/client/src/client_transfer.h +++ b/client/src/client_transfer.h @@ -13,24 +13,49 @@ */ #include "unifyfs-internal.h" +#include "unifyfs_api_internal.h" +#include "margo_client.h" -/* client transfer (stage-in/out) support */ +typedef struct transfer_request_status { + unifyfs_client* client; + unifyfs_transfer_request* req; -#define UNIFYFS_TX_BUFSIZE (8*(1<<20)) + /* set to 1 if source file is in client namespace, otherwise + * destination file is in the namespace */ + int src_in_unify; -enum { - UNIFYFS_TX_STAGE_OUT = 0, - UNIFYFS_TX_STAGE_IN = 1, - UNIFYFS_TX_SERIAL = 0, - UNIFYFS_TX_PARALLEL = 1, -}; + /* the following is for synchronizing access/updates to below state */ + ABT_mutex sync; + volatile unsigned int complete; /* has request completed? */ +} client_transfer_status; -int do_transfer_file_serial(const char* src, - const char* dst, - struct stat* sb_src, - int direction); -int do_transfer_file_parallel(const char* src, - const char* dst, - struct stat* sb_src, - int direction); +/* Create a new transfer status for the given transfer request and + * insert it into client->active_transfers arraylist */ +int client_create_transfer(unifyfs_client* client, + unifyfs_transfer_request* req, + bool src_in_unify); + +/* Retrieve the transfer status for request with the given id */ +client_transfer_status* client_get_transfer(unifyfs_client* client, + unsigned int transfer_id); + +/* Check if the transfer has completed */ +bool client_check_transfer_complete(client_transfer_status* transfer); + +/* Remove the transfer status from client->active_transfers arraylist */ +int client_cleanup_transfer(unifyfs_client* client, + client_transfer_status* transfer); + +/* Update the transfer status for the client (app_id + client_id) + * transfer request (transfer_id) using the given error_code */ +int client_complete_transfer(unifyfs_client* client, + int transfer_id, + int error_code); + +/* Given an array of transfer requests, submit each request after + * creating a transfer status structure for the request and storing it + * within client->active_transfers */ +int client_submit_transfers(unifyfs_client* client, + unifyfs_transfer_request* t_reqs, + size_t n_reqs); diff --git a/client/src/margo_client.c b/client/src/margo_client.c index e95c284a4..ef7ff89ff 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -20,6 +20,7 @@ #include "unifyfs_rpc_util.h" #include "margo_client.h" #include "client_read.h" +#include "client_transfer.h" /* global rpc context */ static client_rpc_context_t* client_rpc_context; // = NULL @@ -56,6 +57,7 @@ static void register_client_rpcs(client_rpc_context_t* ctx) CLIENT_REGISTER_RPC(metaset); CLIENT_REGISTER_RPC(metaget); CLIENT_REGISTER_RPC(filesize); + CLIENT_REGISTER_RPC(transfer); CLIENT_REGISTER_RPC(truncate); CLIENT_REGISTER_RPC(unlink); CLIENT_REGISTER_RPC(laminate); @@ -63,6 +65,7 @@ static void register_client_rpcs(client_rpc_context_t* ctx) CLIENT_REGISTER_RPC(mread); CLIENT_REGISTER_RPC_HANDLER(mread_req_data); CLIENT_REGISTER_RPC_HANDLER(mread_req_complete); + 
CLIENT_REGISTER_RPC_HANDLER(transfer_complete); #undef CLIENT_REGISTER_RPC #undef CLIENT_REGISTER_RPC_HANDLER @@ -73,6 +76,11 @@ int unifyfs_client_rpc_init(void) { hg_return_t hret; + if (NULL != client_rpc_context) { + /* already initialized */ + return UNIFYFS_SUCCESS; + } + /* lookup margo server address string, * should be something like: "na+sm://7170/0" */ char* svr_addr_string = rpc_lookup_local_server_addr(); @@ -198,8 +206,8 @@ static hg_handle_t create_handle(hg_id_t id) return handle; } -/* invokes the attach rpc function */ -int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg) +/* invokes the mount rpc function */ +int invoke_client_mount_rpc(unifyfs_client* client) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -207,14 +215,18 @@ int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg) } /* get handle to rpc function */ - hg_handle_t handle = create_handle(client_rpc_context->rpcs.attach_id); + hg_handle_t handle = create_handle(client_rpc_context->rpcs.mount_id); /* fill in input struct */ - unifyfs_attach_in_t in; - fill_client_attach_info(clnt_cfg, &in); + unifyfs_mount_in_t in; + in.dbg_rank = client->state.app_rank; + in.mount_prefix = strdup(client->cfg.unifyfs_mountpoint); + + /* pass our margo address to the server */ + in.client_addr_str = strdup(client_rpc_context->client_addr_str); /* call rpc function */ - LOGDBG("invoking the attach rpc function in client"); + LOGDBG("invoking the mount rpc function in client"); hg_return_t hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_forward() failed"); @@ -222,13 +234,27 @@ int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg) return UNIFYFS_ERROR_MARGO; } + /* free memory on input struct */ + free((void*)in.mount_prefix); + free((void*)in.client_addr_str); + /* decode response */ int ret; - unifyfs_attach_out_t out; + unifyfs_mount_out_t out; hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; + if (ret == (int)UNIFYFS_SUCCESS) { + /* get assigned client id, and verify app_id */ + client->state.client_id = (int) out.client_id; + int srvr_app_id = (int) out.app_id; + if (client->state.app_id != srvr_app_id) { + LOGWARN("mismatch on app_id - using %d, server returned %d", + client->state.app_id, srvr_app_id); + } + LOGDBG("My client id is %d", client->state.client_id); + } margo_free_output(handle, &out); } else { LOGERR("margo_get_output() failed"); @@ -237,15 +263,37 @@ int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg) /* free resources */ margo_destroy(handle); - if (NULL != in.logio_spill_dir) { - free((void*)in.logio_spill_dir); - } return ret; } -/* invokes the mount rpc function */ -int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg) +/* Fill attach rpc input struct with client-side context info */ +static void fill_client_attach_info(unifyfs_client* client, + unifyfs_attach_in_t* in) +{ + in->app_id = client->state.app_id; + in->client_id = client->state.client_id; + in->shmem_super_size = client->state.shm_super_ctx->size; + in->meta_offset = client->state.write_index.index_offset; + in->meta_size = client->state.write_index.index_size; + + if (NULL != client->state.logio_ctx->shmem) { + in->logio_mem_size = client->state.logio_ctx->shmem->size; + } else { + in->logio_mem_size = 0; + } + + in->logio_spill_size = client->state.logio_ctx->spill_sz; + if (client->state.logio_ctx->spill_sz) { + in->logio_spill_dir = strdup(client->cfg.logio_spill_dir); + } else { + 
in->logio_spill_dir = NULL; + } +} + + +/* invokes the attach rpc function */ +int invoke_client_attach_rpc(unifyfs_client* client) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -253,17 +301,14 @@ int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg) } /* get handle to rpc function */ - hg_handle_t handle = create_handle(client_rpc_context->rpcs.mount_id); + hg_handle_t handle = create_handle(client_rpc_context->rpcs.attach_id); /* fill in input struct */ - unifyfs_mount_in_t in; - fill_client_mount_info(clnt_cfg, &in); - - /* pass our margo address to the server */ - in.client_addr_str = strdup(client_rpc_context->client_addr_str); + unifyfs_attach_in_t in; + fill_client_attach_info(client, &in); /* call rpc function */ - LOGDBG("invoking the mount rpc function in client"); + LOGDBG("invoking the attach rpc function in client"); hg_return_t hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_forward() failed"); @@ -271,27 +316,13 @@ int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg) return UNIFYFS_ERROR_MARGO; } - /* free memory on input struct */ - free((void*)in.mount_prefix); - free((void*)in.client_addr_str); - /* decode response */ int ret; - unifyfs_mount_out_t out; + unifyfs_attach_out_t out; hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; - if (ret == (int)UNIFYFS_SUCCESS) { - /* get assigned client id, and verify app_id */ - unifyfs_client_id = (int) out.client_id; - int srvr_app_id = (int) out.app_id; - if (unifyfs_app_id != srvr_app_id) { - LOGWARN("mismatch on app_id - using %d, server returned %d", - unifyfs_app_id, srvr_app_id); - } - LOGDBG("My client id is %d", unifyfs_client_id); - } margo_free_output(handle, &out); } else { LOGERR("margo_get_output() failed"); @@ -300,12 +331,15 @@ int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg) /* free resources */ margo_destroy(handle); + if (NULL != in.logio_spill_dir) { + free((void*)in.logio_spill_dir); + } return ret; } /* function invokes the unmount rpc */ -int invoke_client_unmount_rpc(void) +int invoke_client_unmount_rpc(unifyfs_client* client) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -317,8 +351,8 @@ int invoke_client_unmount_rpc(void) /* fill in input struct */ unifyfs_unmount_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; /* call rpc function */ LOGDBG("invoking the unmount rpc function in client"); @@ -359,7 +393,8 @@ int invoke_client_unmount_rpc(void) * * f_meta: The metadata values to update. 
*/ -int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, +int invoke_client_metaset_rpc(unifyfs_client* client, + unifyfs_file_attr_op_e attr_op, unifyfs_file_attr_t* f_meta) { /* check that we have initialized margo */ @@ -372,8 +407,8 @@ int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, /* fill in input struct */ unifyfs_metaset_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.attr_op = (int32_t) attr_op; memcpy(&(in.attr), f_meta, sizeof(*f_meta)); @@ -407,7 +442,9 @@ int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, } /* invokes the client metaget rpc function */ -int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* file_meta) +int invoke_client_metaget_rpc(unifyfs_client* client, + int gfid, + unifyfs_file_attr_t* file_meta) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -419,9 +456,9 @@ int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* file_meta) /* fill in input struct */ unifyfs_metaget_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; - in.gfid = (int32_t)gfid; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; + in.gfid = (int32_t) gfid; /* call rpc function */ LOGDBG("invoking the metaget rpc function in client"); @@ -460,7 +497,9 @@ int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* file_meta) } /* invokes the client filesize rpc function */ -int invoke_client_filesize_rpc(int gfid, size_t* outsize) +int invoke_client_filesize_rpc(unifyfs_client* client, + int gfid, + size_t* outsize) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -472,8 +511,8 @@ int invoke_client_filesize_rpc(int gfid, size_t* outsize) /* fill in input struct */ unifyfs_filesize_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.gfid = (int32_t) gfid; /* call rpc function */ @@ -508,7 +547,61 @@ int invoke_client_filesize_rpc(int gfid, size_t* outsize) } /* invokes the client truncate rpc function */ -int invoke_client_truncate_rpc(int gfid, size_t filesize) +int invoke_client_transfer_rpc(unifyfs_client* client, + int transfer_id, + int gfid, + int parallel_transfer, + const char* dest_file) +{ + /* check that we have initialized margo */ + if (NULL == client_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* get handle to rpc function */ + hg_handle_t handle = create_handle(client_rpc_context->rpcs.transfer_id); + + /* fill in input struct */ + unifyfs_transfer_in_t in; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; + in.transfer_id = (int32_t) transfer_id; + in.gfid = (int32_t) gfid; + in.mode = (int32_t) parallel_transfer; + in.dst_file = (hg_const_string_t) dest_file; + + /* call rpc function */ + LOGDBG("invoking the transfer rpc function in client"); + hg_return_t hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_transfer_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + 
margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + + return ret; +} + +/* invokes the client truncate rpc function */ +int invoke_client_truncate_rpc(unifyfs_client* client, + int gfid, + size_t filesize) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -520,8 +613,8 @@ int invoke_client_truncate_rpc(int gfid, size_t filesize) /* fill in input struct */ unifyfs_truncate_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.gfid = (int32_t) gfid; in.filesize = (hg_size_t) filesize; @@ -554,7 +647,8 @@ int invoke_client_truncate_rpc(int gfid, size_t filesize) } /* invokes the client unlink rpc function */ -int invoke_client_unlink_rpc(int gfid) +int invoke_client_unlink_rpc(unifyfs_client* client, + int gfid) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -566,8 +660,8 @@ int invoke_client_unlink_rpc(int gfid) /* fill in input struct */ unifyfs_unlink_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.gfid = (int32_t) gfid; /* call rpc function */ @@ -599,7 +693,8 @@ int invoke_client_unlink_rpc(int gfid) } /* invokes the client-to-server laminate rpc function */ -int invoke_client_laminate_rpc(int gfid) +int invoke_client_laminate_rpc(unifyfs_client* client, + int gfid) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -611,8 +706,8 @@ int invoke_client_laminate_rpc(int gfid) /* fill in input struct */ unifyfs_laminate_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.gfid = (int32_t) gfid; /* call rpc function */ @@ -644,7 +739,8 @@ int invoke_client_laminate_rpc(int gfid) } /* invokes the client sync rpc function */ -int invoke_client_sync_rpc(int gfid) +int invoke_client_sync_rpc(unifyfs_client* client, + int gfid) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -656,8 +752,8 @@ int invoke_client_sync_rpc(int gfid) /* fill in input struct */ unifyfs_fsync_in_t in; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.gfid = (int32_t) gfid; /* call rpc function */ @@ -689,8 +785,11 @@ int invoke_client_sync_rpc(int gfid) } /* invokes the client mread rpc function */ -int invoke_client_mread_rpc(unsigned int reqid, int read_count, - size_t extents_size, void* extents_buffer) +int invoke_client_mread_rpc(unifyfs_client* client, + unsigned int reqid, + int read_count, + size_t extents_size, + void* extents_buffer) { /* check that we have initialized margo */ if (NULL == client_rpc_context) { @@ -711,8 +810,8 @@ int invoke_client_mread_rpc(unsigned int reqid, int read_count, /* fill input struct */ in.mread_id = (int32_t) reqid; - in.app_id = (int32_t) unifyfs_app_id; - in.client_id = (int32_t) unifyfs_client_id; + in.app_id = (int32_t) client->state.app_id; + in.client_id = (int32_t) client->state.client_id; in.read_count = (int32_t) read_count; in.bulk_size = (hg_size_t) extents_size; @@ 
-754,6 +853,7 @@ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) { int ret = UNIFYFS_SUCCESS; + /* get input params */ unifyfs_mread_req_data_in_t in; hg_return_t hret = margo_get_input(handle, &in); @@ -762,9 +862,13 @@ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) ret = UNIFYFS_ERROR_MARGO; } else { /* lookup client mread request */ + unifyfs_client* client; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; int client_mread = (int) in.mread_id; - LOGDBG("looking up mread[%d]", client_mread); - client_mread_status* mread = client_get_mread_status(client_mread); + client = unifyfs_find_client(client_app, client_id, NULL); + client_mread_status* mread = client_get_mread_status(client, + client_mread); if (NULL == mread) { /* unknown client request */ ret = EINVAL; @@ -808,11 +912,12 @@ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) * supports, and a large bulk transfer may result in * failure. */ int i = 0; + hg_size_t offset, len; hg_size_t remain = in.bulk_size; + hg_size_t max_bulk = UNIFYFS_SERVER_MAX_BULK_TX_SIZE; do { - hg_size_t offset = i * MAX_BULK_TX_SIZE; - hg_size_t len = remain < MAX_BULK_TX_SIZE ? - remain : MAX_BULK_TX_SIZE; + offset = i * max_bulk; + len = (remain < max_bulk) ? remain : max_bulk; hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, in.bulk_data, offset, @@ -876,9 +981,13 @@ static void unifyfs_mread_req_complete_rpc(hg_handle_t handle) ret = UNIFYFS_ERROR_MARGO; } else { /* lookup client mread request */ + unifyfs_client* client; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; int client_mread = (int) in.mread_id; - LOGDBG("looking up mread[%d]", client_mread); - client_mread_status* mread = client_get_mread_status(client_mread); + client = unifyfs_find_client(client_app, client_id, NULL); + client_mread_status* mread = client_get_mread_status(client, + client_mread); if (NULL == mread) { /* unknown client request */ ret = EINVAL; @@ -911,3 +1020,50 @@ static void unifyfs_mread_req_complete_rpc(hg_handle_t handle) margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_mread_req_complete_rpc) + +/* for client transfer request identified by transfer_id, + * update request completion state according to input params */ +static void unifyfs_transfer_complete_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; + + /* get input params */ + unifyfs_transfer_complete_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* lookup client mread request */ + unifyfs_client* client; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; + int transfer_id = (int) in.transfer_id; + int error_code = (int) in.error_code; + client = unifyfs_find_client(client_app, client_id, NULL); + if (NULL == client) { + /* unknown client */ + ret = EINVAL; + } else { + /* Update the transfer state */ + ret = client_complete_transfer(client, transfer_id, error_code); + } + margo_free_input(handle, &in); + } + + /* set rpc result status */ + unifyfs_transfer_complete_out_t out; + out.ret = ret; + + LOGDBG("responding"); + + /* return to caller */ + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_transfer_complete_rpc) diff --git a/client/src/margo_client.h b/client/src/margo_client.h index b6f6469bf..cdc296a11 100644 
--- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -19,9 +19,9 @@ * margo_client.h - client-server margo RPCs ********************************************/ -#include -#include "unifyfs_meta.h" +#include "unifyfs_api_internal.h" #include "unifyfs_client_rpcs.h" +#include typedef struct ClientRpcIds { hg_id_t attach_id; @@ -30,6 +30,7 @@ typedef struct ClientRpcIds { hg_id_t metaset_id; hg_id_t metaget_id; hg_id_t filesize_id; + hg_id_t transfer_id; hg_id_t truncate_id; hg_id_t unlink_id; hg_id_t laminate_id; @@ -37,6 +38,7 @@ typedef struct ClientRpcIds { hg_id_t mread_id; hg_id_t mread_req_data_id; hg_id_t mread_req_complete_id; + hg_id_t transfer_complete_id; } client_rpcs_t; typedef struct ClientRpcContext { @@ -52,32 +54,48 @@ int unifyfs_client_rpc_init(void); int unifyfs_client_rpc_finalize(void); -void fill_client_attach_info(unifyfs_cfg_t* clnt_cfg, - unifyfs_attach_in_t* in); -int invoke_client_attach_rpc(unifyfs_cfg_t* clnt_cfg); +int invoke_client_attach_rpc(unifyfs_client* client); -void fill_client_mount_info(unifyfs_cfg_t* clnt_cfg, - unifyfs_mount_in_t* in); -int invoke_client_mount_rpc(unifyfs_cfg_t* clnt_cfg); +int invoke_client_mount_rpc(unifyfs_client* client); -int invoke_client_unmount_rpc(void); +int invoke_client_unmount_rpc(unifyfs_client* client); -int invoke_client_metaset_rpc(unifyfs_file_attr_op_e attr_op, +int invoke_client_metaset_rpc(unifyfs_client* client, + unifyfs_file_attr_op_e attr_op, unifyfs_file_attr_t* f_meta); -int invoke_client_metaget_rpc(int gfid, unifyfs_file_attr_t* f_meta); +int invoke_client_metaget_rpc(unifyfs_client* client, + int gfid, + unifyfs_file_attr_t* f_meta); + +int invoke_client_filesize_rpc(unifyfs_client* client, + int gfid, + size_t* filesize); + +int invoke_client_laminate_rpc(unifyfs_client* client, + int gfid); -int invoke_client_filesize_rpc(int gfid, size_t* filesize); +int invoke_client_mread_rpc(unifyfs_client* client, + unsigned int reqid, + int read_count, + size_t extents_size, + void* extents_buffer); -int invoke_client_truncate_rpc(int gfid, size_t filesize); +int invoke_client_sync_rpc(unifyfs_client* client, + int gfid); -int invoke_client_unlink_rpc(int gfid); +int invoke_client_transfer_rpc(unifyfs_client* client, + int transfer_id, + int gfid, + int parallel_transfer, + const char* dest_file); -int invoke_client_laminate_rpc(int gfid); +int invoke_client_truncate_rpc(unifyfs_client* client, + int gfid, + size_t filesize); -int invoke_client_sync_rpc(int gfid); +int invoke_client_unlink_rpc(unifyfs_client* client, + int gfid); -int invoke_client_mread_rpc(unsigned int reqid, int read_count, - size_t extents_size, void* extents_buffer); #endif // MARGO_CLIENT_H diff --git a/client/src/pmpi_wrappers.c b/client/src/pmpi_wrappers.c index c0e9c730b..ebcfef2f8 100644 --- a/client/src/pmpi_wrappers.c +++ b/client/src/pmpi_wrappers.c @@ -14,6 +14,8 @@ #include +#include +#include #include "pmpi_wrappers.h" #include "unifyfs.h" @@ -24,7 +26,6 @@ int unifyfs_mpi_init(int* argc, char*** argv) int rc, ret; int rank; int world_sz = 0; - int app_id = 0; //fprintf(stderr, "DEBUG: %s - before PMPI_Init()\n", __func__); @@ -36,10 +37,15 @@ int unifyfs_mpi_init(int* argc, char*** argv) //fprintf(stderr, "DEBUG: %s - after PMPI_Init(), rank=%d ret=%d\n", // __func__, rank, ret); - rc = unifyfs_mount("/unifyfs", rank, (size_t)world_sz, app_id); + char* mountpoint = getenv("UNIFYFS_MOUNTPOINT"); + if (NULL == mountpoint) { + mountpoint = strdup("/unifyfs"); + } + + rc = unifyfs_mount(mountpoint, rank, 
(size_t)world_sz); if (UNIFYFS_SUCCESS != rc) { - fprintf(stderr, "UNIFYFS ERROR: unifyfs_mount() failed with '%s'\n", - unifyfs_rc_enum_description((unifyfs_rc)rc)); + fprintf(stderr, "UNIFYFS ERROR: unifyfs_mount(%s) failed with '%s'\n", + mountpoint, unifyfs_rc_enum_description((unifyfs_rc)rc)); } return ret; diff --git a/client/src/posix_client.c b/client/src/posix_client.c new file mode 100644 index 000000000..dbdbfea99 --- /dev/null +++ b/client/src/posix_client.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "posix_client.h" +#include "unifyfs_fid.h" +#include "unifyfs_wrap.h" +#include "unifyfs-sysio.h" + +#ifdef USE_SPATH +#include +#endif + +/* ------------------------------- + * Global variable declarations + * ------------------------------- */ + +unifyfs_client* posix_client; // = NULL + +int global_rank_cnt; /* count of world ranks */ +int client_rank; /* client-provided rank (for debugging) */ + +/* avoid duplicate mounts (for now) */ +int unifyfs_mount_id = -1; + +/* have we initialized? */ +int unifyfs_initialized; // = 0 + +/* whether we can use fgetpos/fsetpos */ +int unifyfs_fpos_enabled = 1; + +/* array of file descriptors */ +unifyfs_fd_t unifyfs_fds[UNIFYFS_CLIENT_MAX_FILES]; +rlim_t unifyfs_fd_limit; + +/* array of file streams */ +unifyfs_stream_t unifyfs_streams[UNIFYFS_CLIENT_MAX_FILES]; + +/* + * TODO: the number of open directories clearly won't exceed the number of + * file descriptors. however, the current MAX_FILES value of 128 will + * quickly run out. if this value is fixed to be reasonably larger, then we + * would need a way to dynamically allocate the dirstreams instead of the + * following fixed size array. 
+ */ + +/* array of DIR* streams to be used */ +unifyfs_dirstream_t unifyfs_dirstreams[UNIFYFS_CLIENT_MAX_FILES]; + +/* stack to track free file descriptor values, + * each is an index into unifyfs_fds array */ +void* posix_fd_stack; + +/* stack to track free file streams, + * each is an index into unifyfs_streams array */ +void* posix_stream_stack; + +/* stack to track free directory streams, + * each is an index into unifyfs_dirstreams array */ +void* posix_dirstream_stack; + +/* mutex to lock stack operations */ +pthread_mutex_t posix_stack_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* ------------------------------- + * Static variable declarations + * ------------------------------- */ + +static int use_single_shm; // = 0 + +static off_t unifyfs_max_offt; +static off_t unifyfs_min_offt; +static off_t unifyfs_max_long; +static off_t unifyfs_min_long; + + +/* ------------------------------- + * Utility functions + * ------------------------------- */ + +/* single function to route all unsupported wrapper calls through */ +void unifyfs_vunsupported(const char* fn_name, + const char* file, + int line, + const char* fmt, + va_list args) +{ + /* print a message about where in the UNIFYFS code we are */ + printf("*** UnifyFS WARNING *** UNSUPPORTED I/O FUNCTION: " + "%s() at %s:%d: ", fn_name, file, line); + + /* print string with more info about call, e.g., param values */ + va_list args2; + va_copy(args2, args); + vprintf(fmt, args2); + va_end(args2); + + /* TODO: should we provide a config option to abort in this case?*/ +} + +void unifyfs_unsupported(const char* fn_name, + const char* file, + int line, + const char* fmt, + ...) +{ + /* print string with more info about call, e.g., param values */ + va_list args; + va_start(args, fmt); + unifyfs_vunsupported(fn_name, file, line, fmt, args); + va_end(args); +} + +/* returns 1 if two input parameters will overflow their type when + * added together */ +int unifyfs_would_overflow_offt(off_t a, off_t b) +{ + /* if both parameters are positive, they could overflow when + * added together */ + if (a > 0 && b > 0) { + /* if the distance between a and max is greater than or equal to + * b, then we could add a and b and still not exceed max */ + if (unifyfs_max_offt - a >= b) { + return 0; + } + return 1; + } + + /* if both parameters are negative, they could underflow when + * added together */ + if (a < 0 && b < 0) { + /* if the distance between min and a is less than or equal to + * b, then we could add a and b and still not exceed min */ + if (unifyfs_min_offt - a <= b) { + return 0; + } + return 1; + } + + /* if a and b are mixed signs or at least one of them is 0, + * then adding them together will produce a result closer to 0 + * or at least no further away than either value already is */ + return 0; +} + +/* returns 1 if two input parameters will overflow their type when + * added together */ +int unifyfs_would_overflow_long(long a, long b) +{ + /* if both parameters are positive, they could overflow when + * added together */ + if (a > 0 && b > 0) { + /* if the distance between a and max is greater than or equal to + * b, then we could add a and b and still not exceed max */ + if (unifyfs_max_long - a >= b) { + return 0; + } + return 1; + } + + /* if both parameters are negative, they could underflow when + * added together */ + if (a < 0 && b < 0) { + /* if the distance between min and a is less than or equal to + * b, then we could add a and b and still not exceed min */ + if (unifyfs_min_long - a <= b) { + return 0; + } + return 1; + } + + 
/* if a and b are mixed signs or at least one of them is 0, + * then adding them together will produce a result closer to 0 + * or at least no further away than either value already is */ + return 0; +} + +/* lock access to shared data structures in superblock */ +int posix_stack_lock(void) +{ + if (use_single_shm) { + return pthread_mutex_lock(&posix_stack_mutex); + } + return 0; +} + +/* unlock access to shared data structures in superblock */ +int posix_stack_unlock(void) +{ + if (use_single_shm) { + return pthread_mutex_unlock(&posix_stack_mutex); + } + return 0; +} + +void unifyfs_normalize_path(const char* path, char* normalized) +{ + /* if we have a relative path, prepend the current working directory */ + if ((path[0] != '/') && (posix_client->cwd != NULL)) { + /* got a relative path, add our cwd */ + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s/%s", + posix_client->cwd, path); + } else { + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", path); + } + +#ifdef USE_SPATH + /* normalize path to handle '.', '..', + * and extra or trailing '/' characters */ + char* str = spath_strdup_reduce_str(normalized); + snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", str); + free(str); +#endif /* USE_SPATH */ +} + +/* Given a path, which may relative or absolute, + * return 1 if we should intercept the path, 0 otherwise. + * If path is to be intercepted, returned a normalized version in upath. */ +int unifyfs_intercept_path(const char* path, char* upath) +{ + /* don't intercept anything until we're initialized */ + if (!unifyfs_initialized) { + return 0; + } + + /* if we have a relative path, prepend the current working directory */ + char target[UNIFYFS_MAX_FILENAME]; + unifyfs_normalize_path(path, target); + + /* if the path starts with our mount point, intercept it */ + int intercept = 0; + if (strncmp(target, unifyfs_mount_prefix, unifyfs_mount_prefixlen) == 0) { + /* characters in target up through mount point match, + * assume we match */ + intercept = 1; + + /* if we have another character, it must be '/' */ + if (strlen(target) > unifyfs_mount_prefixlen && + target[unifyfs_mount_prefixlen] != '/') { + intercept = 0; + } + } + + /* copy normalized path into upath */ + if (intercept) { + strncpy(upath, target, UNIFYFS_MAX_FILENAME); + } + + return intercept; +} + +/* given an fd, return 1 if we should intercept this file, 0 otherwise, + * convert fd to new fd value if needed */ +int unifyfs_intercept_fd(int* fd) +{ + int oldfd = *fd; + + /* don't intercept anything until we're initialized */ + if (!unifyfs_initialized) { + return 0; + } + + if (oldfd < unifyfs_fd_limit) { + /* this fd is a real system fd, so leave it as is */ + return 0; + } else if (oldfd < 0) { + /* this is an invalid fd, so we should not intercept it */ + return 0; + } else { + /* this is an fd we generated and returned to the user, + * so intercept the call and shift the fd */ + int newfd = oldfd - unifyfs_fd_limit; + *fd = newfd; + LOGDBG("Changing fd from exposed %d to internal %d", oldfd, newfd); + return 1; + } +} + +/* given a file stream, return 1 if we should intercept this file, + * 0 otherwise */ +int unifyfs_intercept_stream(FILE* stream) +{ + /* don't intercept anything until we're initialized */ + if (!unifyfs_initialized) { + return 0; + } + + /* check whether this pointer lies within range of our + * file stream array */ + unifyfs_stream_t* ptr = (unifyfs_stream_t*) stream; + unifyfs_stream_t* start = unifyfs_streams; + unifyfs_stream_t* end = start + UNIFYFS_CLIENT_MAX_FILES; + if (ptr >= start && ptr < 
end) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/* given a directory stream, return 1 if we should intercept this
+ * directory, 0 otherwise */
+int unifyfs_intercept_dirstream(DIR* dirp)
+{
+    /* don't intercept anything until we're initialized */
+    if (!unifyfs_initialized) {
+        return 0;
+    }
+
+    /* check whether this pointer lies within range of our
+     * directory stream array */
+
+    unifyfs_dirstream_t* ptr = (unifyfs_dirstream_t*) dirp;
+    unifyfs_dirstream_t* start = unifyfs_dirstreams;
+    unifyfs_dirstream_t* end = start + UNIFYFS_CLIENT_MAX_FILES;
+    if (ptr >= start && ptr < end) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/* initialize file descriptor structure for given fd value */
+int unifyfs_fd_init(int fd)
+{
+    /* get pointer to file descriptor struct for this fd value */
+    unifyfs_fd_t* filedesc = &(unifyfs_fds[fd]);
+
+    /* set fid to -1 to indicate fd is not active,
+     * set file position to max value,
+     * disable read and write flags */
+    filedesc->fid = -1;
+    filedesc->pos = (off_t) -1;
+    filedesc->read = 0;
+    filedesc->write = 0;
+
+    return UNIFYFS_SUCCESS;
+}
+
+/* initialize file streams structure for given sid value */
+int unifyfs_stream_init(int sid)
+{
+    /* get pointer to file stream struct for this id value */
+    unifyfs_stream_t* s = &(unifyfs_streams[sid]);
+
+    /* record our id so when given a pointer to the stream
+     * struct we can easily recover our id value */
+    s->sid = sid;
+
+    /* set fd to -1 to indicate stream is not active */
+    s->fd = -1;
+
+    return UNIFYFS_SUCCESS;
+}
+
+/* initialize directory streams structure for given dirid value */
+int unifyfs_dirstream_init(int dirid)
+{
+    /* get pointer to directory stream struct for this id value */
+    unifyfs_dirstream_t* dirp = &(unifyfs_dirstreams[dirid]);
+
+    /* initialize fields in structure */
+    memset((void*) dirp, 0, sizeof(*dirp));
+
+    /* record our id so when given a pointer to the stream
+     * struct we can easily recover our id value */
+    dirp->dirid = dirid;
+
+    /* set fid to -1 to indicate stream is not active */
+    dirp->fid = -1;
+
+    return UNIFYFS_SUCCESS;
+}
+
+/* given a file descriptor, return the file id */
+int unifyfs_get_fid_from_fd(int fd)
+{
+    /* check that file descriptor is within range */
+    if (fd < 0 || fd >= UNIFYFS_CLIENT_MAX_FILES) {
+        return -1;
+    }
+
+    /* get local file id that file descriptor is associated with,
+     * will be -1 if not active */
+    int fid = unifyfs_fds[fd].fid;
+    return fid;
+}
+
+/* return address of file descriptor structure or NULL if fd is out
+ * of range */
+unifyfs_fd_t* unifyfs_get_filedesc_from_fd(int fd)
+{
+    if (fd >= 0 && fd < UNIFYFS_CLIENT_MAX_FILES) {
+        unifyfs_fd_t* filedesc = &(unifyfs_fds[fd]);
+        return filedesc;
+    }
+    return NULL;
+}
+
+
+/* given a file descriptor, return 1 if file is laminated,
+ * and 0 otherwise */
+int unifyfs_fd_is_laminated(int fd)
+{
+    int fid = unifyfs_get_fid_from_fd(fd);
+    int laminated = unifyfs_fid_is_laminated(posix_client, fid);
+    return laminated;
+}
+
+
+
+/* -------------------------------
+ * POSIX client management
+ * ------------------------------- */
+
+int posix_client_init(void)
+{
+    int rc;
+    int i;
+
+    if (!unifyfs_initialized) {
+
+        unifyfs_handle fshdl;
+        rc = unifyfs_initialize(unifyfs_mount_prefix, NULL, 0, &fshdl);
+        if (rc != UNIFYFS_SUCCESS) {
+            LOGERR("client initialization failed - %s",
+                   unifyfs_rc_enum_description(rc));
+            return rc;
+        }
+        posix_client = (unifyfs_client*) fshdl;
+        assert(NULL != posix_client);
+
+        posix_client->state.app_rank = client_rank;
+
+#ifdef UNIFYFS_GOTCHA
+        rc = 
setup_gotcha_wrappers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to setup gotcha wrappers"); + return rc; + } +#endif + + /* as a hack to support fgetpos/fsetpos, we store the value of + * a void* in an fpos_t so check that there's room and at least + * print a message if this won't work */ + if (sizeof(fpos_t) < sizeof(void*)) { + LOGERR("fgetpos/fsetpos will not work correctly"); + unifyfs_fpos_enabled = 0; + } + + /* compute min and max off_t values */ + unsigned long long bits = sizeof(off_t) * 8; + unifyfs_max_offt = (off_t)((1ULL << (bits - 1ULL)) - 1ULL); + unifyfs_min_offt = (off_t)(-(1ULL << (bits - 1ULL))); + + /* compute min and max long values */ + unifyfs_max_long = LONG_MAX; + unifyfs_min_long = LONG_MIN; + + /* record the max fd for the system */ + /* RLIMIT_NOFILE specifies a value one greater than the maximum + * file descriptor number that can be opened by this process */ + struct rlimit r_limit; + + if (getrlimit(RLIMIT_NOFILE, &r_limit) < 0) { + LOGERR("getrlimit failed: errno=%d (%s)", errno, strerror(errno)); + return UNIFYFS_FAILURE; + } + unifyfs_fd_limit = r_limit.rlim_cur; + LOGDBG("FD limit for system = %ld", unifyfs_fd_limit); + + /* initialize file descriptor structures */ + int num_fds = UNIFYFS_CLIENT_MAX_FILES; + for (i = 0; i < num_fds; i++) { + unifyfs_fd_init(i); + } + + /* initialize file stream structures */ + int num_streams = UNIFYFS_CLIENT_MAX_FILES; + for (i = 0; i < num_streams; i++) { + unifyfs_stream_init(i); + } + + /* initialize directory stream structures */ + int num_dirstreams = UNIFYFS_CLIENT_MAX_FILES; + for (i = 0; i < num_dirstreams; i++) { + unifyfs_dirstream_init(i); + } + + /* initialize stack of free fd values */ + size_t free_fd_size = unifyfs_stack_bytes(num_fds); + posix_fd_stack = malloc(free_fd_size); + unifyfs_stack_init(posix_fd_stack, num_fds); + + /* initialize stack of free stream values */ + size_t free_stream_size = unifyfs_stack_bytes(num_streams); + posix_stream_stack = malloc(free_stream_size); + unifyfs_stack_init(posix_stream_stack, num_streams); + + /* initialize stack of free directory stream values */ + size_t free_dirstream_size = unifyfs_stack_bytes(num_dirstreams); + posix_dirstream_stack = malloc(free_dirstream_size); + unifyfs_stack_init(posix_dirstream_stack, num_dirstreams); + + /* remember that we've now initialized the library */ + unifyfs_initialized = 1; + unifyfs_mount_id = posix_client->state.app_id; + } + + return UNIFYFS_SUCCESS; +} + +/* free resources allocated during posix_client_init(). 
+ * generally, we do this in reverse order with respect to + * how things were initialized */ +int posix_client_fini(void) +{ + int rc = UNIFYFS_SUCCESS; + + if (!unifyfs_initialized) { + /* not initialized yet, so we shouldn't call finalize */ + return UNIFYFS_FAILURE; + } + + unifyfs_handle fshdl = (unifyfs_handle) posix_client; + rc = unifyfs_finalize(fshdl); + if (UNIFYFS_SUCCESS != rc) { + return rc; + } + + /* free directory stream stack */ + if (posix_dirstream_stack != NULL) { + free(posix_dirstream_stack); + posix_dirstream_stack = NULL; + } + + /* free file stream stack */ + if (posix_stream_stack != NULL) { + free(posix_stream_stack); + posix_stream_stack = NULL; + } + + /* free file descriptor stack */ + if (posix_fd_stack != NULL) { + free(posix_fd_stack); + posix_fd_stack = NULL; + } + + /* no longer initialized, so update the flag */ + unifyfs_initialized = 0; + unifyfs_mount_id = -1; + posix_client = NULL; + + return rc; +} + +static +int do_transfer_data(int fd_src, + int fd_dst, + off_t offset, + size_t count) +{ + int ret = UNIFYFS_SUCCESS; + int err; + off_t pos = 0; + ssize_t n_written = 0; + ssize_t n_read = 0; + ssize_t n_processed = 0; + size_t len = UNIFYFS_TRANSFER_BUF_SIZE; + char* buf = NULL; + + buf = malloc(UNIFYFS_TRANSFER_BUF_SIZE); + if (NULL == buf) { + LOGERR("failed to allocate transfer buffer"); + return ENOMEM; + } + + errno = 0; + pos = UNIFYFS_WRAP(lseek)(fd_src, offset, SEEK_SET); + err = errno; + if (pos == (off_t) -1) { + LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); + ret = err; + goto out; + } + + errno = 0; + pos = UNIFYFS_WRAP(lseek)(fd_dst, offset, SEEK_SET); + err = errno; + if (pos == (off_t) -1) { + LOGERR("lseek failed (%d: %s)\n", err, strerror(err)); + ret = err; + goto out; + } + + while (count > n_processed) { + if (len > count) { + len = count; + } + + errno = 0; + n_read = UNIFYFS_WRAP(read)(fd_src, buf, len); + err = errno; + if (n_read == 0) { /* EOF */ + break; + } else if (n_read < 0) { /* error */ + ret = err; + goto out; + } + + do { + errno = 0; + n_written = UNIFYFS_WRAP(write)(fd_dst, buf, n_read); + err = errno; + if (n_written < 0) { + ret = err; + goto out; + } else if ((n_written == 0) && err && (err != EAGAIN)) { + ret = err; + goto out; + } + + n_read -= n_written; + n_processed += n_written; + } while (n_read > 0); + } + +out: + if (NULL != buf) { + free(buf); + } + + return ret; +} + +int transfer_file_serial(const char* src, + const char* dst, + struct stat* sb_src, + int direction) +{ + /* NOTE: we currently do not use the @direction */ + + int err; + int ret = UNIFYFS_SUCCESS; + int fd_src = 0; + int fd_dst = 0; + + errno = 0; + fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); + err = errno; + if (fd_src < 0) { + LOGERR("failed to open() source file %s", src); + return err; + } + + errno = 0; + fd_dst = UNIFYFS_WRAP(open)(dst, O_WRONLY); + err = errno; + if (fd_dst < 0) { + LOGERR("failed to open() destination file %s", dst); + close(fd_src); + return err; + } + + LOGDBG("serial transfer (rank=%d of %d): length=%zu", + client_rank, global_rank_cnt, sb_src->st_size); + + ret = do_transfer_data(fd_src, fd_dst, 0, sb_src->st_size); + if (UNIFYFS_SUCCESS != ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } else { + UNIFYFS_WRAP(fsync)(fd_dst); + } + + UNIFYFS_WRAP(close)(fd_dst); + UNIFYFS_WRAP(close)(fd_src); + + return ret; +} + +int transfer_file_parallel(const char* src, + const char* dst, + struct stat* sb_src, + int direction) +{ + /* NOTE: we currently do 
not use the @direction */ + + int err; + int ret = UNIFYFS_SUCCESS; + int fd_src = 0; + int fd_dst = 0; + uint64_t total_chunks = 0; + uint64_t chunk_start = 0; + uint64_t n_chunks_remainder = 0; + uint64_t n_chunks_per_rank = 0; + uint64_t offset = 0; + uint64_t len = 0; + uint64_t size = sb_src->st_size; + uint64_t last_chunk_size = 0; + + /* calculate total number of chunk transfers */ + total_chunks = size / UNIFYFS_TRANSFER_BUF_SIZE; + last_chunk_size = size % UNIFYFS_TRANSFER_BUF_SIZE; + if (last_chunk_size) { + total_chunks++; + } + + /* calculate chunks per rank */ + n_chunks_per_rank = total_chunks / global_rank_cnt; + n_chunks_remainder = total_chunks % global_rank_cnt; + + /* + * if the file is smaller than (rank_count * transfer_size), just + * use the serial mode. + * + * FIXME: is this assumption fair even for the large rank count? + */ + if (total_chunks <= (uint64_t)global_rank_cnt) { + if (client_rank == 0) { + LOGDBG("using serial transfer for small file"); + ret = transfer_file_serial(src, dst, sb_src, direction); + if (ret) { + LOGERR("transfer_file_serial() failed"); + } + } else { + ret = UNIFYFS_SUCCESS; + } + return ret; + } + + errno = 0; + fd_src = UNIFYFS_WRAP(open)(src, O_RDONLY); + err = errno; + if (fd_src < 0) { + LOGERR("failed to open() source file %s", src); + return err; + } + + errno = 0; + fd_dst = UNIFYFS_WRAP(open)(dst, O_WRONLY); + err = errno; + if (fd_dst < 0) { + LOGERR("failed to open() destination file %s", dst); + UNIFYFS_WRAP(close)(fd_src); + return err; + } + + chunk_start = n_chunks_per_rank * client_rank; + offset = chunk_start * UNIFYFS_TRANSFER_BUF_SIZE; + len = n_chunks_per_rank * UNIFYFS_TRANSFER_BUF_SIZE; + + LOGDBG("parallel transfer (rank=%d of %d): " + "#chunks=%zu, offset=%zu, length=%zu", + client_rank, global_rank_cnt, + (size_t)n_chunks_per_rank, (size_t)offset, (size_t)len); + + ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); + if (ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } else { + if (n_chunks_remainder && (client_rank < n_chunks_remainder)) { + /* do single chunk transfer per rank of remainder portion */ + len = UNIFYFS_TRANSFER_BUF_SIZE; + if (last_chunk_size && (client_rank == (n_chunks_remainder - 1))) { + len = last_chunk_size; + } + chunk_start = (total_chunks - n_chunks_remainder) + client_rank; + offset = chunk_start * UNIFYFS_TRANSFER_BUF_SIZE; + + LOGDBG("parallel transfer (rank=%d of %d): " + "#chunks=1, offset=%zu, length=%zu", + client_rank, global_rank_cnt, + (size_t)offset, (size_t)len); + ret = do_transfer_data(fd_src, fd_dst, (off_t)offset, (size_t)len); + if (ret) { + LOGERR("failed to transfer data (ret=%d, %s)", + ret, unifyfs_rc_enum_description(ret)); + } + } + fsync(fd_dst); + } + + UNIFYFS_WRAP(close)(fd_dst); + UNIFYFS_WRAP(close)(fd_src); + + return ret; +} + +int unifyfs_transfer_file(const char* src, + const char* dst, + int mode) +{ + int rc, err; + int ret = 0; + + int unify_src = 0; + int unify_dst = 0; + int direction = 0; + + char* src_path = NULL; + char* dst_path = NULL; + + char src_upath[UNIFYFS_MAX_FILENAME] = {0}; + char dst_upath[UNIFYFS_MAX_FILENAME] = {0}; + + struct stat sb_src = {0}; + struct stat sb_dst = {0}; + + if (unifyfs_intercept_path(src, src_upath)) { + direction = UNIFYFS_TRANSFER_DIRECTION_OUT; + unify_src = 1; + src_path = strdup(src_upath); + } else { + src_path = strdup(src); + } + if (NULL == src_path) { + return -ENOMEM; + } + + errno = 0; + rc = UNIFYFS_WRAP(stat)(src_path, &sb_src); + 
err = errno; + if (rc < 0) { + free(src_path); + return -err; + } + + if (unifyfs_intercept_path(dst, dst_upath)) { + direction = UNIFYFS_TRANSFER_DIRECTION_IN; + unify_dst = 1; + dst_path = strdup(dst_upath); + } else { + /* check if destination is a directory */ + errno = 0; + rc = UNIFYFS_WRAP(stat)(dst, &sb_dst); + err = errno; + if (rc == 0 && S_ISDIR(sb_dst.st_mode)) { + /* if destination path is a directory, append the + * basename of the source file */ + char* src_base = basename(src_path); + size_t dst_len = strlen(dst) + strlen(src_base) + 2; + dst_path = (char*) malloc(dst_len); + snprintf(dst_path, dst_len, "%s/%s", dst, src_base); + } else { + dst_path = strdup(dst); + } + } + if (NULL == dst_path) { + free(src_path); + return -ENOMEM; + } + + if ((unify_src + unify_dst) == 0) { + /* this should still work, but print a warning */ + LOGWARN("neither source nor destination are UnifyFS files"); + } else if ((unify_src + unify_dst) == 2) { + /* both not allowed, fail the operation with EINVAL */ + LOGERR("both source and destination are UnifyFS files"); + free(src_path); + free(dst_path); + return -EINVAL; + } + + /* for both serial and parallel transfers, use rank 0 client to + * create the destination file using the source file's mode */ + if (0 == client_rank) { + errno = 0; + int create_flags = O_CREAT | O_WRONLY | O_TRUNC; + int fd = UNIFYFS_WRAP(open)(dst_path, create_flags, sb_src.st_mode); + err = errno; + if (fd < 0) { + LOGERR("failed to create destination file %s", dst_path); + free(src_path); + free(dst_path); + return -err; + } + close(fd); + } + + if (mode == UNIFYFS_TRANSFER_PARALLEL) { + rc = transfer_file_parallel(src_path, dst_path, &sb_src, direction); + } else { + rc = transfer_file_serial(src_path, dst_path, &sb_src, direction); + } + + if (rc != UNIFYFS_SUCCESS) { + ret = -unifyfs_rc_errno(rc); + } else { + ret = 0; + + /* If the destination file is in UnifyFS, then laminate it so that it + * will be readable by other clients. */ + if (unify_dst) { + /* remove the write bits from the source file's mode bits to set + * the new file mode. use chmod with the new mode to ask for file + * lamination. */ + mode_t no_write = (sb_src.st_mode) & ~(0222); + UNIFYFS_WRAP(chmod)(dst_path, no_write); + } + } + + free(src_path); + free(dst_path); + + return ret; +} diff --git a/client/src/posix_client.h b/client/src/posix_client.h new file mode 100644 index 000000000..c510f1498 --- /dev/null +++ b/client/src/posix_client.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#ifndef UNIFYFS_POSIX_CLIENT_H +#define UNIFYFS_POSIX_CLIENT_H + +#include "unifyfs.h" +#include "unifyfs-internal.h" +#include "unifyfs_api_internal.h" + +/* ---------------------------------------- + * Structure and enumeration declarations + * ---------------------------------------- */ + +/* structure to represent file descriptors */ +typedef struct { + int fid; /* local file id associated with fd */ + off_t pos; /* current file pointer */ + int read; /* whether file is opened for read */ + int write; /* whether file is opened for write */ + int append; /* whether file is opened for append */ +} unifyfs_fd_t; + +enum unifyfs_stream_orientation { + UNIFYFS_STREAM_ORIENTATION_NULL = 0, + UNIFYFS_STREAM_ORIENTATION_BYTE, + UNIFYFS_STREAM_ORIENTATION_WIDE, +}; + +/* structure to represent FILE* streams */ +typedef struct { + int sid; /* index within unifyfs_streams */ + int err; /* stream error indicator flag */ + int eof; /* stream end-of-file indicator flag */ + int fd; /* file descriptor associated with stream */ + int append; /* whether file is opened in append mode */ + int orient; /* stream orientation {NULL, BYTE, WIDE} */ + + void* buf; /* pointer to buffer */ + int buffree; /* whether we need to free buffer */ + int buftype; /* fully buffered, line buffered, or unbuffered */ + size_t bufsize; /* size of buffer in bytes */ + off_t bufpos; /* byte offset in file corresponding to start of buffer */ + size_t buflen; /* number of bytes active in buffer */ + size_t bufdirty; /* whether data in buffer needs to be flushed */ + + unsigned char* ubuf; /* ungetc buffer (we store bytes from end) */ + size_t ubufsize; /* size of ungetc buffer in bytes */ + size_t ubuflen; /* number of active bytes in buffer */ + + unsigned char* _p; /* pointer to character in buffer */ + size_t _r; /* number of bytes left at pointer */ +} unifyfs_stream_t; + +/* structure to represent DIR* streams */ +typedef struct { + int dirid; /* index within unifyfs_dirstreams */ + int fid; /* local file id of directory for this stream */ + int fd; /* file descriptor associated with stream */ + off_t pos; /* position within directory stream */ +} unifyfs_dirstream_t; + + +/* client transfer (stage-in/out) support */ + +#define UNIFYFS_TRANSFER_BUF_SIZE (8 * MIB) + +enum { + UNIFYFS_TRANSFER_DIRECTION_OUT = 0, + UNIFYFS_TRANSFER_DIRECTION_IN = 1 +}; + + +/* ------------------------------- + * Global variable declarations + * ------------------------------- */ + +extern unifyfs_client* posix_client; + +extern int global_rank_cnt; /* count of world ranks */ +extern int client_rank; /* client-provided rank (for debugging) */ + +/* keep track of whether we have initialized & mounted */ +extern char* unifyfs_mount_prefix; +extern size_t unifyfs_mount_prefixlen; +extern int unifyfs_initialized; +extern int unifyfs_mount_id; /* gfid of mountpoint */ + +/* whether we can use fgetpos/fsetpos */ +extern int unifyfs_fpos_enabled; + +/* array of file descriptors */ +extern unifyfs_fd_t unifyfs_fds[UNIFYFS_CLIENT_MAX_FILES]; +extern rlim_t unifyfs_fd_limit; + +/* array of file streams */ +extern unifyfs_stream_t unifyfs_streams[UNIFYFS_CLIENT_MAX_FILES]; + +/* array of directory streams */ +extern unifyfs_dirstream_t unifyfs_dirstreams[UNIFYFS_CLIENT_MAX_FILES]; + +/* stack of free file descriptor values, + * each is an index into unifyfs_fds array */ +extern void* posix_fd_stack; + +/* stack of free streams, + * each is an index into unifyfs_streams array */ +extern void* posix_stream_stack; + +/* stack of directory streams, + 
* each is an index into unifyfs_dirstreams array */
+extern void* posix_dirstream_stack;
+
+/* mutex to lock stack operations */
+extern pthread_mutex_t posix_stack_mutex;
+
+/* -------------------------------
+ * Initialization and finalization
+ * ------------------------------- */
+
+/* Initialize UnifyFS as a POSIX client */
+int posix_client_init(void);
+
+/* Finalize POSIX client */
+int posix_client_fini(void);
+
+
+/* -------------------------------
+ * Common functions
+ * ------------------------------- */
+
+int posix_stack_lock(void);
+
+int posix_stack_unlock(void);
+
+/* returns 1 if two input parameters will overflow their type when
+ * added together */
+int unifyfs_would_overflow_offt(off_t a, off_t b);
+
+/* returns 1 if two input parameters will overflow their type when
+ * added together */
+int unifyfs_would_overflow_long(long a, long b);
+
+/* sets flag if the path should be intercepted as a unifyfs path,
+ * and if so, writes normalized path in upath, which should
+ * be a buffer of size UNIFYFS_MAX_FILENAME */
+int unifyfs_intercept_path(const char* path, char* upath);
+
+/* given an fd, return 1 if we should intercept this file, 0 otherwise,
+ * convert fd to new fd value if needed */
+int unifyfs_intercept_fd(int* fd);
+
+/* given a FILE*, returns 1 if we should intercept this file,
+ * 0 otherwise */
+int unifyfs_intercept_stream(FILE* stream);
+
+/* given a DIR*, returns 1 if we should intercept this directory,
+ * 0 otherwise */
+int unifyfs_intercept_dirstream(DIR* dirp);
+
+/* given a file descriptor, return the file id */
+int unifyfs_get_fid_from_fd(int fd);
+
+/* initialize file descriptor structure corresponding to fd value */
+int unifyfs_fd_init(int fd);
+
+/* initialize file stream structure corresponding to id value */
+int unifyfs_stream_init(int sid);
+
+/* initialize directory stream descriptor structure
+ * corresponding to id value */
+int unifyfs_dirstream_init(int dirid);
+
+/* return address of file descriptor structure or NULL if fd is out
+ * of range */
+unifyfs_fd_t* unifyfs_get_filedesc_from_fd(int fd);
+
+/* Return 1 if fd is laminated, 0 if not */
+int unifyfs_fd_is_laminated(int fd);
+
+/* transfer src file to dst using a single client */
+int transfer_file_serial(const char* src,
+                         const char* dst,
+                         struct stat* sb_src,
+                         int direction);
+
+/* transfer src file to dst using all clients */
+int transfer_file_parallel(const char* src,
+                           const char* dst,
+                           struct stat* sb_src,
+                           int direction);
+
+
+
+#endif /* UNIFYFS_POSIX_CLIENT_H */
diff --git a/client/src/unifyfs-dirops.c b/client/src/unifyfs-dirops.c
index 80b7e9f7d..84f5240e4 100644
--- a/client/src/unifyfs-dirops.c
+++ b/client/src/unifyfs-dirops.c
@@ -13,13 +13,15 @@
  */
 
 #include "unifyfs-sysio.h"
+#include "posix_client.h"
+#include "unifyfs_fid.h"
 
 /* given a file id corresponding to a directory,
  * allocate and initialize a directory stream */
-static inline unifyfs_dirstream_t* unifyfs_dirstream_alloc(int fid)
+static unifyfs_dirstream_t* unifyfs_dirstream_alloc(int fid)
 {
     /* allocate a file descriptor for this stream */
-    int fd = unifyfs_stack_pop(unifyfs_fd_stack);
+    int fd = unifyfs_stack_pop(posix_fd_stack);
     if (fd < 0) {
         /* exhausted our file descriptors */
         errno = EMFILE;
@@ -27,11 +29,11 @@ static inline unifyfs_dirstream_t* unifyfs_dirstream_alloc(int fid)
     }
 
     /* allocate a directory stream id */
-    int dirid = unifyfs_stack_pop(unifyfs_dirstream_stack);
+    int dirid = unifyfs_stack_pop(posix_dirstream_stack);
     if (dirid < 0) {
         /* exhausted our directory streams,
          * return our 
file descriptor and set errno */ - unifyfs_stack_push(unifyfs_fd_stack, fd); + unifyfs_stack_push(posix_fd_stack, fd); errno = EMFILE; return NULL; } @@ -65,21 +67,21 @@ static inline unifyfs_dirstream_t* unifyfs_dirstream_alloc(int fid) } /* release resources allocated in unifyfs_dirstream_alloc */ -static inline int unifyfs_dirstream_free(unifyfs_dirstream_t* dirp) +static int unifyfs_dirstream_free(unifyfs_dirstream_t* dirp) { /* reinit file descriptor to indicate that it's no longer in use, * not really necessary, but should help find bugs */ unifyfs_fd_init(dirp->fd); /* return file descriptor to the free stack */ - unifyfs_stack_push(unifyfs_fd_stack, dirp->fd); + unifyfs_stack_push(posix_fd_stack, dirp->fd); /* reinit dir stream to indicate that it's no longer in use, * not really necessary, but should help find bugs */ unifyfs_dirstream_init(dirp->dirid); /* return our index to directory stream stack */ - unifyfs_stack_push(unifyfs_dirstream_stack, dirp->dirid); + unifyfs_stack_push(posix_dirstream_stack, dirp->dirid); return UNIFYFS_SUCCESS; } @@ -99,11 +101,11 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) * if valid, populate the local file meta cache accordingly. */ - int fid = unifyfs_get_fid_from_path(upath); + int fid = unifyfs_fid_from_path(posix_client, upath); int gfid = unifyfs_generate_gfid(upath); unifyfs_file_attr_t gfattr = { 0, }; - int ret = unifyfs_get_global_file_meta(gfid, &gfattr); + int ret = unifyfs_get_global_file_meta(posix_client, gfid, &gfattr); if (ret != UNIFYFS_SUCCESS) { errno = ENOENT; return NULL; @@ -120,7 +122,7 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) unifyfs_filemeta_t* meta = NULL; if (fid >= 0) { - meta = unifyfs_get_meta_from_fid(fid); + meta = unifyfs_get_meta_from_fid(posix_client, fid); assert(meta != NULL); /* @@ -128,18 +130,18 @@ DIR* UNIFYFS_WRAP(opendir)(const char* name) * global metadb. is it safe to invalidate the local entry and * re-populate with the global data? */ - if (!unifyfs_fid_is_dir(fid)) { + if (!unifyfs_fid_is_dir(posix_client, fid)) { errno = ENOTDIR; return NULL; } } else { - fid = unifyfs_fid_create_file(upath, 0); + fid = unifyfs_fid_create_file(posix_client, upath, 0); if (fid < 0) { errno = unifyfs_rc_errno(-fid); return NULL; } - meta = unifyfs_get_meta_from_fid(fid); + meta = unifyfs_get_meta_from_fid(posix_client, fid); assert(meta != NULL); /* set as directory */ diff --git a/client/src/unifyfs-dirops.h b/client/src/unifyfs-dirops.h index 876360391..cfa56ad67 100644 --- a/client/src/unifyfs-dirops.h +++ b/client/src/unifyfs-dirops.h @@ -12,10 +12,11 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -#ifndef __UNIFYFS_DIROPS_H -#define __UNIFYFS_DIROPS_H +#ifndef UNIFYFS_DIROPS_H +#define UNIFYFS_DIROPS_H #include "unifyfs-internal.h" +#include "unifyfs_wrap.h" /* * FIXME: is this portable to use the linux dirent structure? @@ -48,5 +49,5 @@ UNIFYFS_DECL(scandir, int, (const char* dirp, struct dirent** namelist, const struct dirent**))); UNIFYFS_DECL(seekdir, void, (DIR* dirp, long loc)); -#endif /* __UNIFYFS_DIROPS_H */ +#endif /* UNIFYFS_DIROPS_H */ diff --git a/client/src/unifyfs-fixed.c b/client/src/unifyfs-fixed.c deleted file mode 100644 index abb9ec01f..000000000 --- a/client/src/unifyfs-fixed.c +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. 
- * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -/* - * Copyright (c) 2013, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * code Written by - * Raghunath Rajachandrasekar - * Kathryn Mohror - * Adam Moody - * All rights reserved. - * This file is part of CRUISE. - * For details, see https://github.com/hpc/cruise - * Please also read this file LICENSE.CRUISE - */ - -#include "unifyfs-internal.h" -#include "unifyfs-fixed.h" -#include "unifyfs_log.h" -#include "margo_client.h" -#include "seg_tree.h" - -/* --------------------------------------- - * Operations on client write index - * --------------------------------------- */ - -/* - * Clear all entries in the log index. This only clears the metadata, - * not the data itself. - */ -static void clear_index(void) -{ - *unifyfs_indices.ptr_num_entries = 0; -} - -/* Add the metadata for a single write to the index */ -static int add_write_meta_to_index(unifyfs_filemeta_t* meta, - off_t file_pos, - off_t log_pos, - size_t length) -{ - /* add write extent to our segment trees */ - if (unifyfs_local_extents) { - /* record write extent in our local cache */ - seg_tree_add(&meta->extents, - file_pos, - file_pos + length - 1, - log_pos); - } - - /* - * We want to make sure this write will not overflow the maximum - * number of index entries we can sync with server. A write can at most - * create two new nodes in the seg_tree. If we're close to potentially - * filling up the index, sync it out. - */ - unsigned long count_before = seg_tree_count(&meta->extents_sync); - if (count_before >= (unifyfs_max_index_entries - 2)) { - /* this will flush our segments, sync them, and set the running - * segment count back to 0 */ - unifyfs_sync_extents(meta->fid); - } - - /* store the write in our segment tree used for syncing with server. */ - seg_tree_add(&meta->extents_sync, - file_pos, - file_pos + length - 1, - log_pos); - - return UNIFYFS_SUCCESS; -} - -/* - * Remove all entries in the current index and re-write it using the write - * metadata stored in the target file's extents_sync segment tree. This only - * re-writes the metadata in the index. All the actual data is still kept - * in the write log and will be referenced correctly by the new metadata. - * - * After this function is done, 'unifyfs_indices' will have been totally - * re-written. The writes in the index will be flattened, non-overlapping, - * and sequential. The extents_sync segment tree will be cleared. - * - * This function is called when we sync our extents with the server. - * - * Returns maximum write log offset for synced extents. 
- */ -off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta) -{ - /* get pointer to index buffer */ - unifyfs_index_t* indexes = unifyfs_indices.index_entry; - - /* Erase the index before we re-write it */ - clear_index(); - - /* count up number of entries we wrote to buffer */ - unsigned long idx = 0; - - /* record maximum write log offset */ - off_t max_log_offset = 0; - - int gfid = meta->attrs.gfid; - - seg_tree_rdlock(&meta->extents_sync); - /* For each write in this file's seg_tree ... */ - struct seg_tree_node* node = NULL; - while ((node = seg_tree_iter(&meta->extents_sync, node))) { - indexes[idx].file_pos = node->start; - indexes[idx].log_pos = node->ptr; - indexes[idx].length = node->end - node->start + 1; - indexes[idx].gfid = gfid; - idx++; - if ((off_t)(node->end) > max_log_offset) { - max_log_offset = (off_t) node->end; - } - } - seg_tree_unlock(&meta->extents_sync); - /* All done processing this files writes. Clear its seg_tree */ - seg_tree_clear(&meta->extents_sync); - - /* record total number of entries in index buffer */ - *unifyfs_indices.ptr_num_entries = idx; - - return max_log_offset; -} - -/* - * Find any write extents that span or exceed truncation point and remove them. - * - * This function is called when we truncate a file and there are cached writes. - */ -int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz) -{ - if (0 == trunc_sz) { - /* All writes should be removed. Clear extents_sync */ - seg_tree_clear(&meta->extents_sync); - - if (unifyfs_local_extents) { - /* Clear the local extent cache too */ - seg_tree_clear(&meta->extents); - } - return UNIFYFS_SUCCESS; - } - - unsigned long trunc_off = (unsigned long) trunc_sz; - int rc = seg_tree_remove(&meta->extents_sync, trunc_off, ULONG_MAX); - if (unifyfs_local_extents) { - rc = seg_tree_remove(&meta->extents, trunc_off, ULONG_MAX); - } - if (rc) { - LOGERR("removal of write extents due to truncation failed"); - rc = UNIFYFS_FAILURE; - } else { - rc = UNIFYFS_SUCCESS; - } - return rc; -} - - -/* - * Sync all the write extents for the target file(s) to the server. - * The target_fid identifies a specific file, or all files (-1). - * Clears the metadata index afterwards. - * - * Returns 0 on success, nonzero otherwise. 
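
As the comments deleted above describe, unifyfs_rewrite_index_from_seg_tree() walks a file's extents_sync tree and emits one index entry per coalesced, non-overlapping extent, each carrying a file offset, a log offset, and a length. The sketch below illustrates only that flattening step on a plain sorted array; the write_rec type and the contiguity rule are simplified stand-ins for the real seg_tree node data, not UnifyFS code.

    #include <stdio.h>
    #include <stddef.h>

    /* one write record: file offset, offset of the data in the write log, length */
    typedef struct {
        size_t file_pos;
        size_t log_pos;
        size_t length;
    } write_rec;

    /* Merge adjacent records in place; 'recs' must be sorted by file_pos and
     * non-overlapping. Two records merge only when both the file range and the
     * log range are contiguous, so the merged entry still points at one
     * contiguous region of the log. Returns the new record count. */
    static size_t flatten_writes(write_rec* recs, size_t n)
    {
        if (n == 0) {
            return 0;
        }
        size_t out = 0;
        for (size_t i = 1; i < n; i++) {
            write_rec* prev = &recs[out];
            write_rec* cur = &recs[i];
            int file_contig = (prev->file_pos + prev->length == cur->file_pos);
            int log_contig = (prev->log_pos + prev->length == cur->log_pos);
            if (file_contig && log_contig) {
                prev->length += cur->length;   /* extend the previous entry */
            } else {
                recs[++out] = *cur;            /* start a new entry */
            }
        }
        return out + 1;
    }

    int main(void)
    {
        /* three writes: the first two are contiguous in file and log space */
        write_rec recs[] = {
            { 0,   0,   100 },
            { 100, 100, 50  },
            { 500, 150, 25  },
        };
        size_t n = flatten_writes(recs, 3);
        for (size_t i = 0; i < n; i++) {
            printf("index entry: file_pos=%zu log_pos=%zu length=%zu\n",
                   recs[i].file_pos, recs[i].log_pos, recs[i].length);
        }
        return 0;
    }

In the code being removed here, the same walk also clears extents_sync afterwards and records the total entry count in the shared index buffer; the sketch leaves those bookkeeping steps out.
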
- */ -int unifyfs_sync_extents(int target_fid) -{ - int tmp_rc; - int ret = UNIFYFS_SUCCESS; - - /* if caller gave us a file id, sync that specific fid */ - if (target_fid >= 0) { - /* user named a specific file id, lookup its metadata */ - int fid = target_fid; - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((NULL == meta) || (meta->fid != fid)) { - /* bail out with an error if we fail to find it */ - LOGERR("missing filemeta for fid=%d", fid); - return UNIFYFS_FAILURE; - } - - /* sync with server if we need to */ - if (meta->needs_sync) { - /* write contents from segment tree to index buffer */ - off_t max_log_off = unifyfs_rewrite_index_from_seg_tree(meta); - - /* if there are no index entries, we've got nothing to sync */ - if (*unifyfs_indices.ptr_num_entries == 0) { - /* consider that we've sync'd successfully */ - meta->needs_sync = 0; - return UNIFYFS_SUCCESS; - } - - /* ensure any data written to the spillover file is flushed */ - off_t logio_shmem_size; - unifyfs_logio_get_sizes(logio_ctx, &logio_shmem_size, NULL); - if (max_log_off >= logio_shmem_size) { - /* some extents range into spill over area, - * so flush data to spill over file */ - tmp_rc = unifyfs_logio_sync(logio_ctx); - if (UNIFYFS_SUCCESS != tmp_rc) { - LOGERR("failed to sync logio data"); - ret = tmp_rc; - } - LOGDBG("after logio spill sync"); - } - - /* tell the server to grab our new extents */ - tmp_rc = invoke_client_sync_rpc(meta->attrs.gfid); - if (UNIFYFS_SUCCESS != tmp_rc) { - /* something went wrong when trying to flush extents */ - LOGERR("failed to flush write index to server for gfid=%d", - meta->attrs.gfid); - ret = tmp_rc; - } - - /* we've sync'd, so mark this file as being up-to-date */ - meta->needs_sync = 0; - - /* flushed, clear buffer and refresh number of entries - * and number remaining */ - clear_index(); - } - - return ret; - } - - /* to get here, caller specified target_fid = -1, - * so sync every file descriptor */ - for (int i = 0; i < UNIFYFS_CLIENT_MAX_FILEDESCS; i++) { - /* get file id for each file descriptor */ - int fid = unifyfs_fds[i].fid; - if (-1 == fid) { - /* file descriptor is not currently in use */ - continue; - } - - /* got an open file, sync this file id */ - tmp_rc = unifyfs_sync_extents(fid); - if (UNIFYFS_SUCCESS != tmp_rc) { - ret = tmp_rc; - } - } - - return ret; -} - -/* --------------------------------------- - * Operations on file storage - * --------------------------------------- */ - -/** - * Write data to file using log-based I/O - * - * @param fid file id to write to - * @param meta metadata for file - * @param pos file position to start writing at - * @param buf user buffer holding data - * @param count number of bytes to write - * @param nwritten number of bytes written - * @return UNIFYFS_SUCCESS, or error code - */ -int unifyfs_fid_logio_write(int fid, - unifyfs_filemeta_t* meta, - off_t pos, - const void* buf, - size_t count, - size_t* nwritten) -{ - /* assume we'll fail to write anything */ - *nwritten = 0; - - assert(meta != NULL); - if (meta->storage != FILE_STORAGE_LOGIO) { - LOGERR("file (fid=%d) storage mode != FILE_STORAGE_LOGIO", fid); - return EINVAL; - } - - /* allocate space in the log for this write */ - off_t log_off; - int rc = unifyfs_logio_alloc(logio_ctx, count, &log_off); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("logio_alloc(%zu) failed", count); - return rc; - } - - /* do the write */ - rc = unifyfs_logio_write(logio_ctx, log_off, count, buf, nwritten); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("logio_write(%zu, %zu) 
failed", log_off, count); - return rc; - } - - if (*nwritten < count) { - LOGWARN("partial logio_write() @ offset=%zu (%zu of %zu bytes)", - (size_t)log_off, *nwritten, count); - } else { - LOGDBG("fid=%d pos=%zu - successful logio_write() " - "@ log offset=%zu (%zu bytes)", - fid, (size_t)pos, (size_t)log_off, count); - } - - /* update our write metadata for this write */ - rc = add_write_meta_to_index(meta, pos, log_off, *nwritten); - return rc; -} diff --git a/client/src/unifyfs-fixed.h b/client/src/unifyfs-fixed.h deleted file mode 100644 index 288ff923c..000000000 --- a/client/src/unifyfs-fixed.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* - * Copyright (c) 2017, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * Copyright (c) 2017, Florida State University. Contributions from - * the Computer Architecture and Systems Research Laboratory (CASTL) - * at the Department of Computer Science. - * - * Written by: Teng Wang, Adam Moody, Weikuan Yu, Kento Sato, Kathryn Mohror - * LLNL-CODE-728877. All rights reserved. - * - * This file is part of burstfs. - * For details, see https://github.com/llnl/burstfs - * Please read https://github.com/llnl/burstfs/LICENSE for full license text. - */ - -/* - * Copyright (c) 2013, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * code Written by - * Raghunath Rajachandrasekar - * Kathryn Mohror - * Adam Moody - * All rights reserved. - * This file is part of CRUISE. - * For details, see https://github.com/hpc/cruise - * Please also read this file LICENSE.CRUISE - */ - -#ifndef UNIFYFS_FIXED_H -#define UNIFYFS_FIXED_H - -#include "unifyfs-internal.h" - -/* rewrite client's shared memory index of file write extents */ -off_t unifyfs_rewrite_index_from_seg_tree(unifyfs_filemeta_t* meta); - -/* remove/truncate write extents in client metadata */ -int truncate_write_meta(unifyfs_filemeta_t* meta, off_t trunc_sz); - -/* sync all writes for target file(s) with the server */ -int unifyfs_sync_extents(int target_fid); - -/* write data to file using log-based I/O */ -int unifyfs_fid_logio_write( - int fid, /* file id to write to */ - unifyfs_filemeta_t* meta, /* meta data for file */ - off_t pos, /* file position to start writing at */ - const void* buf, /* user buffer holding data */ - size_t count, /* number of bytes to write */ - size_t* nwritten /* returns number of bytes written */ -); - -#endif /* UNIFYFS_FIXED_H */ diff --git a/client/src/unifyfs-internal.h b/client/src/unifyfs-internal.h index 000381c68..0e2f625e9 100644 --- a/client/src/unifyfs-internal.h +++ b/client/src/unifyfs-internal.h @@ -111,466 +111,5 @@ #include "utlist.h" #include "uthash.h" -/* ------------------------------- - * Defines and types - * ------------------------------- - */ - -/* define a macro to capture function name, file name, and line number - * along with user-defined string */ -#define UNIFYFS_UNSUPPORTED(fmt, args...) 
\ - unifyfs_unsupported(__func__, __FILE__, __LINE__, fmt, ##args) - -#ifdef UNIFYFS_GOTCHA -#include - -/* the name of our wrapper - we use __wrap_ instead of */ -#define UNIFYFS_WRAP(name) __wrap_ ## name - -/* the name of the real function pointer */ -#define UNIFYFS_REAL(name) __real_ ## name - -/* declare anything that will be used externally */ -#define UNIFYFS_DECL(name, ret, args) \ - extern gotcha_wrappee_handle_t wrappee_handle_ ## name; \ - extern ret (*__real_ ## name) args; \ - ret __wrap_ ## name args - -/* ask gotcha for the address of the real function */ -#define MAP_OR_FAIL(name) \ -do { \ - if (NULL == __real_ ## name) { \ - __real_ ## name = gotcha_get_wrappee(wrappee_handle_ ## name); \ - if (NULL == __real_ ## name) { \ - assert(!"missing Gotcha wrappee for " #name); \ - } \ - } \ -} while (0) - -int setup_gotcha_wrappers(void); - -#elif UNIFYFS_PRELOAD - -/* =================================================================== - * Using LD_PRELOAD to intercept - * =================================================================== - * we need to use the same function names the application is calling, - * and we then invoke the real library function after looking it up with - * dlsym */ - -/* we need the dlsym function */ -#include - -/* define a static variable called __real_open to record address of - * real open call and initialize it to NULL */ -#define UNIFYFS_DECL(name,ret,args) \ - static ret (*__real_ ## name)args = NULL; - -/* our open wrapper assumes the name of open() */ -#define UNIFYFS_WRAP(name) name - -/* the address of the real open call is stored in __real_open variable */ -#define UNIFYFS_REAL(name) __real_ ## name - -/* if __real_open is still NULL, call dlsym to lookup address of real - * function and record it */ -#define MAP_OR_FAIL(func) \ - if (!(__real_ ## func)) \ - { \ - __real_ ## func = dlsym(RTLD_NEXT, #func); \ - if (!(__real_ ## func)) { \ - fprintf(stderr, "UNIFYFS failed to map symbol: %s\n", #func); \ - exit(1); \ - } \ - } -#else - -/* =================================================================== - * Using ld -wrap option to intercept - * =================================================================== - * the linker will convert application calls from open --> __wrap_open, - * so we define all of our functions as the __wrap variant and then - * to call the real library, we call __real_open */ - -/* we don't need a variable to record the address of the real function, - * just declare the existence of __real_open so the compiler knows the - * prototype of this function (linker will provide it), also need to - * declare prototype for __wrap_open */ -#define UNIFYFS_DECL(name,ret,args) \ - extern ret __real_ ## name args; \ - ret __wrap_ ## name args; - -/* we define our wrapper function as __wrap_open instead of open */ -#define UNIFYFS_WRAP(name) __wrap_ ## name - -/* the linker maps the open call to __real_open() */ -#define UNIFYFS_REAL(name) __real_ ## name - -/* no need to look up the address of the real function */ -#define MAP_OR_FAIL(func) - -#endif - -/* ---------------------------------------- - * Structure and enumeration declarations - * ---------------------------------------- */ - -/* structure to represent file descriptors */ -typedef struct { - int fid; /* local file id associated with fd */ - off_t pos; /* current file pointer */ - int read; /* whether file is opened for read */ - int write; /* whether file is opened for write */ - int append; /* whether file is opened for append */ -} unifyfs_fd_t; - -enum 
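
The three interception modes removed from unifyfs-internal.h here (Gotcha, LD_PRELOAD, and ld --wrap) differ mainly in how a wrapper reaches the real libc symbol. The LD_PRELOAD mode can be shown in isolation: the sketch below wraps unlink() and resolves the next definition with dlsym(RTLD_NEXT, ...), the same lookup the deleted MAP_OR_FAIL macro performs in that mode. It is a generic illustration, not the UnifyFS wrapper itself.

    /* build: gcc -shared -fPIC preload_unlink.c -o preload_unlink.so -ldl
     * use:   LD_PRELOAD=./preload_unlink.so ./some_program                */
    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* pointer to the real unlink(), resolved lazily on first use */
    static int (*real_unlink)(const char* path) = NULL;

    int unlink(const char* path)
    {
        /* resolve the next unlink() definition in library search order */
        if (real_unlink == NULL) {
            real_unlink = (int (*)(const char*)) dlsym(RTLD_NEXT, "unlink");
            if (real_unlink == NULL) {
                fprintf(stderr, "failed to map symbol: unlink\n");
                exit(1);
            }
        }

        /* wrapper logic would go here; this one only logs and forwards */
        fprintf(stderr, "intercepted unlink(%s)\n", path);
        return real_unlink(path);
    }
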
unifyfs_stream_orientation { - UNIFYFS_STREAM_ORIENTATION_NULL = 0, - UNIFYFS_STREAM_ORIENTATION_BYTE, - UNIFYFS_STREAM_ORIENTATION_WIDE, -}; - -/* structure to represent FILE* streams */ -typedef struct { - int sid; /* index within unifyfs_streams */ - int err; /* stream error indicator flag */ - int eof; /* stream end-of-file indicator flag */ - int fd; /* file descriptor associated with stream */ - int append; /* whether file is opened in append mode */ - int orient; /* stream orientation, UNIFYFS_STREAM_ORIENTATION_{NULL,BYTE,WIDE} */ - - void* buf; /* pointer to buffer */ - int buffree; /* whether we need to free buffer */ - int buftype; /* _IOFBF fully buffered, _IOLBF line buffered, _IONBF unbuffered */ - size_t bufsize; /* size of buffer in bytes */ - off_t bufpos; /* byte offset in file corresponding to start of buffer */ - size_t buflen; /* number of bytes active in buffer */ - size_t bufdirty; /* whether data in buffer needs to be flushed */ - - unsigned char* ubuf; /* ungetc buffer (we store bytes from end) */ - size_t ubufsize; /* size of ungetc buffer in bytes */ - size_t ubuflen; /* number of active bytes in buffer */ - - unsigned char* _p; /* pointer to character in buffer */ - size_t _r; /* number of bytes left at pointer */ -} unifyfs_stream_t; - -/* structure to represent DIR* streams */ -typedef struct { - int dirid; /* index within unifyfs_dirstreams */ - int fid; /* local file id of directory for this stream */ - int fd; /* file descriptor associated with stream */ - off_t pos; /* position within directory stream */ -} unifyfs_dirstream_t; - -enum flock_enum { - UNLOCKED, - EX_LOCKED, - SH_LOCKED -}; - -enum unifyfs_file_storage { - FILE_STORAGE_NULL = 0, - FILE_STORAGE_LOGIO -}; - -typedef struct { - int fid; /* local file index in filemetas array */ - int storage; /* FILE_STORAGE type */ - - pthread_spinlock_t fspinlock; /* file lock variable */ - enum flock_enum flock_status; /* file lock status */ - - int needs_sync; /* have unsynced writes */ - struct seg_tree extents_sync; /* Segment tree containing our coalesced - * writes between sync operations */ - struct seg_tree extents; /* Segment tree of all local data extents */ - - unifyfs_file_attr_t attrs; /* UnifyFS and POSIX file attributes */ -} unifyfs_filemeta_t; - -/* struct used to map a full path to its local file id, - * an array of these is kept and a simple linear search - * is used to find a match */ -typedef struct { - /* flag incidating whether slot is in use */ - int in_use; - - /* full path and name of file */ - const char filename[UNIFYFS_MAX_FILENAME]; -} unifyfs_filename_t; - -/* This structure defines a client read request for one file corresponding to - * the global file id (gfid). It describes a contiguous read extent starting - * at offset with given length. */ -typedef struct { - /* The read request parameters */ - int gfid; /* global id of file to be read */ - size_t offset; /* logical file offset */ - size_t length; /* requested number of bytes */ - char* buf; /* user buffer to place data */ - struct aiocb* aiocbp; /* user aiocb* from aio or listio */ - - /* These two variables define the byte offset range of the extent for - * which we filled valid data. - * If cover_begin_offset != 0, there is a gap at the beginning - * of the read extent that should be zero-filled. - * If cover_end_offset != (length - 1), it was a short read. */ - volatile size_t cover_begin_offset; - volatile size_t cover_end_offset; - - /* nread is the user-visible number of bytes read. 
Since this includes - * any gaps, nread should be set to (cover_end_offset + 1) when the - * read request has been fully serviced. */ - size_t nread; - - /* errcode holds any error code encountered during the read. - * The error may be an internal error value (unifyfs_rc_e) or a - * normal POSIX error code. It will be converted to a valid errno value - * for use in returning from the syscall. */ - int errcode; -} read_req_t; - -typedef struct { - size_t* ptr_num_entries; - unifyfs_index_t* index_entry; -} unifyfs_index_buf_t; - - -/* ------------------------------- - * Global variable declarations - * ------------------------------- */ - - - -extern int global_rank_cnt; /* count of world ranks */ -extern int client_rank; /* client-provided rank (for debugging) */ - -extern int unifyfs_mounted; /* avoid duplicate mounts (for now) */ -extern int unifyfs_app_id; /* application (aka mountpoint) id */ -extern int unifyfs_client_id; /* client id within application */ - -extern unifyfs_index_buf_t unifyfs_indices; -extern unsigned long unifyfs_max_index_entries; - -/* log-based I/O context */ -extern logio_context* logio_ctx; - -/* whether to return UNIFYFS (true) or TMPFS (false) magic value from statfs */ -extern bool unifyfs_super_magic; - -/* keep track of what we've initialized */ -extern int unifyfs_initialized; - -/* list of file name structures of fixed length, - * used to map a full path to its local file id, - * an array of these is kept and a simple linear search - * is used to find a match */ -extern unifyfs_filename_t* unifyfs_filelist; - -/* mount directory */ -extern char* unifyfs_mount_prefix; -extern size_t unifyfs_mount_prefixlen; - -/* tracks current working directory within unifyfs directory namespace */ -extern char* unifyfs_cwd; - -/* array of file descriptors */ -extern unifyfs_fd_t unifyfs_fds[UNIFYFS_CLIENT_MAX_FILEDESCS]; -extern rlim_t unifyfs_fd_limit; - -/* array of file streams */ -extern unifyfs_stream_t unifyfs_streams[UNIFYFS_CLIENT_MAX_FILEDESCS]; - -/* array of directory streams */ -extern unifyfs_dirstream_t unifyfs_dirstreams[UNIFYFS_CLIENT_MAX_FILEDESCS]; - -/* stack of free file descriptor values, - * each is an index into unifyfs_fds array */ -extern void* unifyfs_fd_stack; - -/* stack of free streams, - * each is an index into unifyfs_streams array */ -extern void* unifyfs_stream_stack; - -/* stack of directory streams, - * each is an index into unifyfs_dirstreams array */ -extern void* unifyfs_dirstream_stack; - -/* mutex to lock stack operations */ -extern pthread_mutex_t unifyfs_stack_mutex; - -extern int unifyfs_max_files; /* maximum number of files to store */ -extern bool unifyfs_local_extents; /* enable tracking of local extents */ - -/* ------------------------------- - * Common functions - * ------------------------------- */ - -int unifyfs_init(unifyfs_cfg_t* clnt_cfg); -int unifyfs_fini(void); - -/* single function to route all unsupported wrapper calls through */ -int unifyfs_unsupported(const char* fn_name, const char* file, int line, - const char* fmt, ...); - -/* returns 1 if two input parameters will overflow their type when - * added together */ -int unifyfs_would_overflow_offt(off_t a, off_t b); - -/* returns 1 if two input parameters will overflow their type when - * added together */ -int unifyfs_would_overflow_long(long a, long b); - -int unifyfs_stack_lock(void); - -int unifyfs_stack_unlock(void); - -/* sets flag if the path should be intercept as a unifyfs path, - * and if so, writes normalized path in upath, which should - * be a 
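
The read_req_t comments above spell out how a serviced read is reported: cover_begin_offset and cover_end_offset bracket the bytes the server actually delivered, any leading gap reads back as zeros, and nread becomes cover_end_offset + 1 (or 0 if nothing arrived, since cover_end_offset starts at (size_t)-1). A small self-contained sketch of that completion step, using a simplified toy_read_req in place of the real structure:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    /* simplified stand-in for the client read request bookkeeping */
    typedef struct {
        char*  buf;                 /* user buffer for this read extent */
        size_t length;              /* requested number of bytes */
        size_t cover_begin_offset;  /* first byte offset filled with real data */
        size_t cover_end_offset;    /* last byte offset filled with real data */
        size_t nread;               /* user-visible bytes read (gaps included) */
    } toy_read_req;

    /* Finish a serviced request: zero-fill the leading gap and compute nread.
     * A short read (cover_end_offset < length - 1) simply yields a smaller nread. */
    static void toy_read_complete(toy_read_req* req)
    {
        if (req->cover_end_offset == (size_t)-1) {
            req->nread = 0;                   /* nothing was delivered */
            return;
        }
        if (req->cover_begin_offset > 0) {
            /* bytes before the first delivered extent read back as zeros */
            memset(req->buf, 0, req->cover_begin_offset);
        }
        req->nread = req->cover_end_offset + 1;
    }

    int main(void)
    {
        char data[16];
        memset(data, 'x', sizeof(data));

        /* pretend the server filled bytes 8..11 of a 16-byte request */
        toy_read_req req = { data, sizeof(data), 8, 11, 0 };
        toy_read_complete(&req);

        printf("nread=%zu first byte=%d\n", req.nread, data[0]); /* 12 and 0 */
        return 0;
    }
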
buffer of size UNIFYFS_MAX_FILENAME */ -int unifyfs_intercept_path(const char* path, char* upath); - -/* given an fd, return 1 if we should intercept this file, 0 otherwise, - * convert fd to new fd value if needed */ -int unifyfs_intercept_fd(int* fd); - -/* given a FILE*, returns 1 if we should intercept this file, - * 0 otherwise */ -int unifyfs_intercept_stream(FILE* stream); - -/* given a DIR*, returns 1 if we should intercept this directory, - * 0 otherwise */ -int unifyfs_intercept_dirstream(DIR* dirp); - -/* given a path, return the file id */ -int unifyfs_get_fid_from_path(const char* path); - -/* given a file descriptor, return the file id */ -int unifyfs_get_fid_from_fd(int fd); - -/* initialze file descriptor structure corresponding to fd value */ -int unifyfs_fd_init(int fd); - -/* initialze file stream structure corresponding to id value */ -int unifyfs_stream_init(int sid); - -/* initialze directory stream descriptor structure - * corresponding to id value */ -int unifyfs_dirstream_init(int dirid); - -/* return address of file descriptor structure or NULL if fd is out - * of range */ -unifyfs_fd_t* unifyfs_get_filedesc_from_fd(int fd); - -/* given a file id, return a pointer to the meta data, - * otherwise return NULL */ -unifyfs_filemeta_t* unifyfs_get_meta_from_fid(int fid); - -/* Return 1 if fid is laminated, 0 if not */ -int unifyfs_fid_is_laminated(int fid); - -/* Return 1 if fd is laminated, 0 if not */ -int unifyfs_fd_is_laminated(int fd); - -/* Given a fid, return the path. */ -const char* unifyfs_path_from_fid(int fid); - -/* Given a fid, return a gfid */ -int unifyfs_gfid_from_fid(const int fid); - -/* returns fid for corresponding gfid, if one is active, - * returns -1 otherwise */ -int unifyfs_fid_from_gfid(const int gfid); - -/* checks to see if fid is a directory - * returns 1 for yes - * returns 0 for no */ -int unifyfs_fid_is_dir(int fid); - -/* checks to see if a directory is empty - * assumes that check for is_dir has already been made - * only checks for full path matches, does not check relative paths, - * e.g. ../dirname will not work - * returns 1 for yes it is empty - * returns 0 for no */ -int unifyfs_fid_is_dir_empty(const char* path); - -/* Return current global size of given file id */ -off_t unifyfs_fid_global_size(int fid); - -/* if we have a local fid structure corresponding to the gfid - * in question, we attempt the file lookup with the fid method - * otherwise call back to the rpc */ -off_t unifyfs_gfid_filesize(int gfid); - -/* - * Return current size of given file id. If the file is laminated, return the - * global size. Otherwise, return the local size. 
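
posix_client.h keeps the unifyfs_would_overflow_offt()/_long() declarations, and callers such as the SEEK_END paths in this series check them before adding a user offset to a file size. Their implementation is not shown in this diff, so the following is only a plausible sketch of such a check, written against int64_t rather than off_t:

    #include <stdint.h>
    #include <stdio.h>

    /* Returns 1 if a + b would overflow or underflow int64_t, 0 otherwise.
     * The comparisons are arranged so the check itself never overflows. */
    static int would_overflow_i64(int64_t a, int64_t b)
    {
        if (b > 0 && a > INT64_MAX - b) {
            return 1;   /* positive overflow */
        }
        if (b < 0 && a < INT64_MIN - b) {
            return 1;   /* negative overflow */
        }
        return 0;
    }

    int main(void)
    {
        printf("%d\n", would_overflow_i64(INT64_MAX, 1));   /* 1 */
        printf("%d\n", would_overflow_i64(INT64_MAX, -1));  /* 0 */
        printf("%d\n", would_overflow_i64(-10, 5));         /* 0 */
        return 0;
    }
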
- */ -off_t unifyfs_fid_logical_size(int fid); - -/* Update local metadata for file from global metadata */ -int unifyfs_fid_update_file_meta(int fid, unifyfs_file_attr_t* gfattr); - -/* allocate a file id slot for a new file - * return the fid or -1 on error */ -int unifyfs_fid_alloc(void); - -/* return the file id back to the free pool */ -int unifyfs_fid_free(int fid); - -/* add a new file and initialize metadata - * returns the new fid, or negative value on error */ -int unifyfs_fid_create_file(const char* path, - int exclusive); - -/* add a new directory and initialize metadata - * returns the new fid, or a negative value on error */ -int unifyfs_fid_create_directory(const char* path); - -/* write count bytes from buf into file starting at offset pos */ -int unifyfs_fid_write( - int fid, /* local file id to write to */ - off_t pos, /* starting offset within file */ - const void* buf, /* buffer of data to be written */ - size_t count, /* number of bytes to write */ - size_t* nwritten /* returns number of bytes written */ -); - -/* truncate file id to given length, frees resources if length is - * less than size and allocates and zero-fills new bytes if length - * is more than size */ -int unifyfs_fid_truncate(int fid, off_t length); - -/* sync data for file id to server if needed */ -int unifyfs_fid_sync(int fid); - -/* opens a new file id with specified path, access flags, and permissions, - * fills outfid with file id and outpos with position for current file pointer, - * returns UNIFYFS error code */ -int unifyfs_fid_open(const char* path, int flags, mode_t mode, int* outfid, - off_t* outpos); - -int unifyfs_fid_close(int fid); - -/* unlink file and then delete its associated state */ -int unifyfs_fid_unlink(int fid); - -/* delete a file id, free its local storage resources, and return - * the file id to free stack */ -int unifyfs_fid_delete(int fid); - -/* global metadata functions */ - -int unifyfs_set_global_file_meta_from_fid(int fid, - unifyfs_file_attr_op_e op); - -int unifyfs_set_global_file_meta(int gfid, - unifyfs_file_attr_op_e op, - unifyfs_file_attr_t* gfattr); - -int unifyfs_get_global_file_meta(int gfid, - unifyfs_file_attr_t* gfattr); #endif /* UNIFYFS_INTERNAL_H */ diff --git a/client/src/unifyfs-stdio.c b/client/src/unifyfs-stdio.c index 85ccffe8c..b7206101e 100644 --- a/client/src/unifyfs-stdio.c +++ b/client/src/unifyfs-stdio.c @@ -42,8 +42,9 @@ #include "unifyfs-stdio.h" #include "unifyfs-sysio.h" +#include "posix_client.h" +#include "unifyfs_fid.h" -static int unifyfs_fpos_enabled = 1; /* whether we can use fgetpos/fsetpos */ /* --------------------------------------- * POSIX wrappers: file streams @@ -87,7 +88,7 @@ static const char* unifyfs_stream_name(FILE* fp) const char* name = NULL; int fid = unifyfs_get_fid_from_fd(s->fd); if (fid >= 0) { - name = unifyfs_filelist[fid].filename; + name = posix_client->unifyfs_filelist[fid].filename; } return name; } @@ -113,35 +114,32 @@ int unifyfs_unsupported_stream( /* determine length of string to hold formatted args */ va_list args1; va_start(args1, format); - int strlen = vsnprintf(NULL, 0, format, args1); + int len = vsnprintf(NULL, 0, format, args1); va_end(args1); /* allocate memory for string */ - int chars = strlen + 1; - char* str = (char*) malloc(chars); - if (str == NULL) { + int chars = len + 1; + char* args_str = (char*) malloc(chars); + if (args_str == NULL) { /* Error */ + return ENOMEM; } /* create the string */ va_list args2; va_start(args2, format); - vsnprintf(str, chars, format, args2); + 
vsnprintf(args_str, chars, format, args2); va_end(args2); /* print message */ - va_list args; - va_start(args, format); - int rc = unifyfs_unsupported( - wrap_fn, wrap_file, wrap_line, - "file %s pos %lu msg %s", name, (unsigned long) pos, str - ); - va_end(args); + unifyfs_unsupported(wrap_fn, wrap_file, wrap_line, + "file %s pos %lu msg %s", + name, (unsigned long) pos, args_str); /* free the string */ - free(str); + free(args_str); - return rc; + return 0; } int unifyfs_stream_set_pointers(unifyfs_stream_t* s) @@ -299,21 +297,25 @@ static int unifyfs_fopen( */ if (plus) { /* r+ ==> open file for update (reading and writing) */ - open_rc = unifyfs_fid_open(path, O_RDWR, perms, &fid, &pos); + open_rc = unifyfs_fid_open(posix_client, path, O_RDWR, perms, + &fid, &pos); } else { /* r ==> open file for reading */ - open_rc = unifyfs_fid_open(path, O_RDONLY, perms, &fid, &pos); + open_rc = unifyfs_fid_open(posix_client, path, O_RDONLY, perms, + &fid, &pos); } } else if (write) { if (plus) { /* w+ ==> truncate to zero length or create file for update * (read/write) */ - open_rc = unifyfs_fid_open(path, O_RDWR | O_CREAT | O_TRUNC, + open_rc = unifyfs_fid_open(posix_client, path, + O_RDWR | O_CREAT | O_TRUNC, perms, &fid, &pos); } else { /* w ==> truncate to zero length or create file for writing */ - open_rc = unifyfs_fid_open(path, O_WRONLY | O_CREAT | O_TRUNC, + open_rc = unifyfs_fid_open(posix_client, path, + O_WRONLY | O_CREAT | O_TRUNC, perms, &fid, &pos); } } else if (append) { @@ -321,11 +323,12 @@ static int unifyfs_fopen( if (plus) { /* a+ ==> append, open or create file for update, initial file * position for reading should be at start */ - open_rc = unifyfs_fid_open(path, O_RDWR | O_CREAT, + open_rc = unifyfs_fid_open(posix_client, path, O_RDWR | O_CREAT, perms, &fid, &pos); } else { /* a ==> append, open or create file for writing, at end of file */ - open_rc = unifyfs_fid_open(path, O_WRONLY | O_CREAT | O_APPEND, + open_rc = unifyfs_fid_open(posix_client, path, + O_WRONLY | O_CREAT | O_APPEND, perms, &fid, &pos); } } @@ -336,7 +339,7 @@ static int unifyfs_fopen( } /* allocate a stream for this file */ - int sid = unifyfs_stack_pop(unifyfs_stream_stack); + int sid = unifyfs_stack_pop(posix_stream_stack); if (sid < 0) { /* TODO: would like to return EMFILE to indicate * process has hit file stream limit, not the OS */ @@ -349,13 +352,13 @@ static int unifyfs_fopen( unifyfs_stream_t* s = &(unifyfs_streams[sid]); /* allocate a file descriptor for this file */ - int fd = unifyfs_stack_pop(unifyfs_fd_stack); + int fd = unifyfs_stack_pop(posix_fd_stack); if (fd < 0) { /* TODO: would like to return EMFILE to indicate * process has hit file descriptor limit, not the OS */ /* put back our stream id */ - unifyfs_stack_push(unifyfs_stream_stack, sid); + unifyfs_stack_push(posix_stream_stack, sid); /* exhausted our file descriptors */ return ENFILE; @@ -501,7 +504,7 @@ static int unifyfs_stream_flush(FILE* stream) } /* invoke fsync rpc to register index metadata with server */ - int ret = unifyfs_fid_sync(fid); + int ret = unifyfs_fid_sync_extents(posix_client, fid); if (ret != UNIFYFS_SUCCESS) { /* sync failed for some reason, set errno and return error */ s->err = 1; @@ -728,7 +731,7 @@ static int unifyfs_stream_write( errno = EBADF; return EBADF; } - current = unifyfs_fid_logical_size(fid); + current = unifyfs_fid_logical_size(posix_client, fid); /* like a seek, we discard push back bytes */ s->ubuflen = 0; @@ -942,7 +945,7 @@ static int unifyfs_fseek(FILE* stream, off_t offset, int whence) 
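
The reworked unifyfs_unsupported_stream() earlier in this file now sizes its message with a first vsnprintf(NULL, 0, ...) pass, allocates, and then formats into the right-sized buffer. Here is that measure-then-format idiom in isolation; format_alloc is a name invented for the example, not a UnifyFS function.

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Format a printf-style message into a freshly malloc'd string.
     * Returns NULL on failure; the caller frees the result. */
    static char* format_alloc(const char* fmt, ...)
    {
        va_list args1, args2;
        va_start(args1, fmt);
        va_copy(args2, args1);

        /* pass 1: measure the formatted length without writing anything */
        int len = vsnprintf(NULL, 0, fmt, args1);
        va_end(args1);

        char* str = NULL;
        if (len >= 0) {
            str = malloc((size_t)len + 1);
            if (str != NULL) {
                /* pass 2: actually format into the right-sized buffer */
                vsnprintf(str, (size_t)len + 1, fmt, args2);
            }
        }
        va_end(args2);
        return str;
    }

    int main(void)
    {
        char* msg = format_alloc("file %s pos %lu", "/unifyfs/a", 42UL);
        if (msg != NULL) {
            puts(msg);
            free(msg);
        }
        return 0;
    }
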
break; case SEEK_END: /* seek to EOF + offset */ - filesize = unifyfs_fid_logical_size(fid); + filesize = unifyfs_fid_logical_size(posix_client, fid); if (unifyfs_would_overflow_offt(filesize, offset)) { s->err = 1; errno = EOVERFLOW; @@ -1725,7 +1728,7 @@ int UNIFYFS_WRAP(fflush)(FILE* stream) /* flush each active unifyfs stream */ int i; - for (i = 0; i < UNIFYFS_CLIENT_MAX_FILEDESCS; i++) { + for (i = 0; i < UNIFYFS_CLIENT_MAX_FILES; i++) { /* get stream and check whether it's active */ unifyfs_stream_t* s = &(unifyfs_streams[i]); if (s->fd >= 0) { @@ -1879,7 +1882,7 @@ int UNIFYFS_WRAP(fclose)(FILE* stream) } /* close the file */ - int close_rc = unifyfs_fid_close(fid); + int close_rc = unifyfs_fid_close(posix_client, fid); if (close_rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(close_rc); return EOF; @@ -1891,13 +1894,13 @@ int UNIFYFS_WRAP(fclose)(FILE* stream) unifyfs_fd_init(s->fd); /* add file descriptor back to free stack */ - unifyfs_stack_push(unifyfs_fd_stack, s->fd); + unifyfs_stack_push(posix_fd_stack, s->fd); /* set file descriptor to -1 to indicate stream is invalid */ unifyfs_stream_init(s->sid); /* add stream back to free stack */ - unifyfs_stack_push(unifyfs_stream_stack, s->sid); + unifyfs_stack_push(posix_stream_stack, s->sid); /* currently a no-op */ return 0; diff --git a/client/src/unifyfs-stdio.h b/client/src/unifyfs-stdio.h index d653ddd42..de455a27a 100644 --- a/client/src/unifyfs-stdio.h +++ b/client/src/unifyfs-stdio.h @@ -44,6 +44,7 @@ #define UNIFYFS_STDIO_H #include "unifyfs-internal.h" +#include "unifyfs_wrap.h" /* TODO: declare the wrapper functions we define in unifyfs-stdio.c * so other routines can call them */ diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 01d4cdb85..72d3fda35 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -40,11 +40,12 @@ * Please also read this file LICENSE.CRUISE */ -#include "unifyfs.h" -#include "unifyfs-internal.h" +#include "unifyfs.h" // for UNIFYFS_SUPER_MAGIC #include "unifyfs-sysio.h" #include "margo_client.h" +#include "posix_client.h" #include "client_read.h" +#include "unifyfs_fid.h" /* --------------------------------------- * POSIX wrappers: paths @@ -56,7 +57,7 @@ int UNIFYFS_WRAP(access)(const char* path, int mode) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* check if path exists */ - if (unifyfs_get_fid_from_path(upath) < 0) { + if (unifyfs_fid_from_path(posix_client, upath) < 0) { LOGDBG("access: unifyfs_get_id_from path failed, returning -1, %s", upath); errno = ENOENT; @@ -86,13 +87,13 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* check if it already exists */ - if (unifyfs_get_fid_from_path(upath) >= 0) { + if (unifyfs_fid_from_path(posix_client, upath) >= 0) { errno = EEXIST; return -1; } /* add directory to file list */ - int ret = unifyfs_fid_create_directory(upath); + int ret = unifyfs_fid_create_directory(posix_client, upath); if (ret != UNIFYFS_SUCCESS) { /* failed to create the directory, * set errno and return */ @@ -115,7 +116,7 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) * If mask == 0 (e.g., remove wrapper made request), the target doesn't need * any mode specific checks before attempting to remove. 
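
The helper renamed to _unifyfs_remove() below keeps one deletion path shared by the unlink, rmdir, and remove wrappers, which pass masks of S_IFREG, S_IFDIR, and 0 respectively; the mask only decides which type mismatches are rejected up front. A standalone sketch of that dispatch against an ordinary POSIX filesystem, mirroring the control flow but none of the UnifyFS local/global metadata handling:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Shared remove helper: mask = S_IFREG (unlink), S_IFDIR (rmdir), 0 (remove).
     * Returns 0 on success, -1 with errno set on failure. */
    static int remove_with_mask(const char* path, mode_t mask)
    {
        struct stat sb;
        if (lstat(path, &sb) != 0) {
            return -1;                   /* errno set by lstat (e.g. ENOENT) */
        }

        int is_dir = S_ISDIR(sb.st_mode);
        if (mask == S_IFREG && is_dir) {
            errno = EISDIR;              /* unlink() may not remove a directory */
            return -1;
        }
        if (mask == S_IFDIR && !is_dir) {
            errno = ENOTDIR;             /* rmdir() requires a directory */
            return -1;
        }

        /* mask == 0 (remove()) accepts either type */
        return is_dir ? rmdir(path) : unlink(path);
    }

    int main(int argc, char** argv)
    {
        if (argc != 2) {
            fprintf(stderr, "usage: %s <path>\n", argv[0]);
            return 1;
        }
        if (remove_with_mask(argv[1], 0) != 0) {
            perror("remove_with_mask");
            return 1;
        }
        return 0;
    }
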
* */ -static int unifyfs_remove(const char* upath, mode_t mask) +static int _unifyfs_remove(const char* upath, mode_t mask) { /* check if the mount point itself is being deleted */ if (!strcmp(upath, unifyfs_mount_prefix)) { @@ -124,10 +125,10 @@ static int unifyfs_remove(const char* upath, mode_t mask) } /* check if path exists locally */ - int fid = unifyfs_get_fid_from_path(upath); + int fid = unifyfs_fid_from_path(posix_client, upath); if (fid >= 0) { /* found path locally */ - int is_dir = unifyfs_fid_is_dir(fid); + int is_dir = unifyfs_fid_is_dir(posix_client, fid); /* is it a directory? */ if (is_dir) { @@ -141,7 +142,7 @@ static int unifyfs_remove(const char* upath, mode_t mask) /* remove/rmdir likely made this request (mask & (0 | S_IFDIR)) */ /* is it empty? */ - if (!unifyfs_fid_is_dir_empty(upath)) { + if (!unifyfs_fid_is_dir_empty(posix_client, upath)) { /* ERROR: is a directory, but isn't empty */ LOGDBG("Attempting to remove non-empty dir %s in UNIFYFS", upath); @@ -159,7 +160,7 @@ static int unifyfs_remove(const char* upath, mode_t mask) } /* remove the target from the file list */ - int ret = unifyfs_fid_unlink(fid); + int ret = unifyfs_fid_unlink(posix_client, fid); if (ret != UNIFYFS_SUCCESS) { /* failed to remove the target, * set errno and return */ @@ -172,7 +173,7 @@ static int unifyfs_remove(const char* upath, mode_t mask) int gfid = unifyfs_generate_gfid(upath); unifyfs_file_attr_t attr = {0}; - int ret = unifyfs_get_global_file_meta(gfid, &attr); + int ret = unifyfs_get_global_file_meta(posix_client, gfid, &attr); if (ret != UNIFYFS_SUCCESS) { /* ERROR: path doesn't exist locally or globally */ LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); @@ -206,7 +207,7 @@ static int unifyfs_remove(const char* upath, mode_t mask) } /* delete the target */ - ret = invoke_client_unlink_rpc(gfid); + ret = invoke_client_unlink_rpc(posix_client, gfid); if (ret != UNIFYFS_SUCCESS) { LOGDBG("unlink rpc failed on %s in UNIFYFS", upath); errno = unifyfs_rc_errno(ret); @@ -224,7 +225,7 @@ int UNIFYFS_WRAP(rmdir)(const char* path) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with S_IFDIR mask */ - int ret = unifyfs_remove(upath, S_IFDIR); + int ret = _unifyfs_remove(upath, S_IFDIR); if (ret != UNIFYFS_SUCCESS) { LOGDBG("rmdir() failed on %s in UNIFYFS", upath); return -1; @@ -246,10 +247,10 @@ int UNIFYFS_WRAP(chdir)(const char* path) if (unifyfs_intercept_path(path, upath)) { /* TODO: check that path is not a file? 
*/ /* we're happy to change into any directory in unifyfs */ - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); + if (posix_client->cwd != NULL) { + free(posix_client->cwd); } - unifyfs_cwd = strdup(upath); + posix_client->cwd = strdup(upath); return 0; } else { MAP_OR_FAIL(chdir); @@ -258,8 +259,8 @@ int UNIFYFS_WRAP(chdir)(const char* path) /* if the change dir was successful, * update our current working directory */ if (unifyfs_initialized && ret == 0) { - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); + if (posix_client->cwd != NULL) { + free(posix_client->cwd); } /* if we did a real chdir, let's use a real getcwd @@ -267,14 +268,14 @@ int UNIFYFS_WRAP(chdir)(const char* path) MAP_OR_FAIL(getcwd); char* cwd = UNIFYFS_REAL(getcwd)(NULL, 0); if (cwd != NULL) { - unifyfs_cwd = cwd; + posix_client->cwd = cwd; - /* parts of the code may assume unifyfs_cwd is a max size */ + /* check posix_client->cwd is within max filename length */ size_t len = strlen(cwd) + 1; if (len > UNIFYFS_MAX_FILENAME) { LOGERR("Current working dir longer (%lu bytes) " - "than UNIFYFS_MAX_FILENAME=%d", - (unsigned long) len, UNIFYFS_MAX_FILENAME); + "than UNIFYFS_MAX_FILENAME=%d", + (unsigned long) len, UNIFYFS_MAX_FILENAME); } } else { /* ERROR */ @@ -297,7 +298,7 @@ static char* _getcwd_impl(char* path, size_t size) } /* get length of current working dir */ - size_t len = strlen(unifyfs_cwd) + 1; + size_t len = strlen(posix_client->cwd) + 1; /* if user didn't provide a buffer, * we attempt to allocate and return one for them */ @@ -313,7 +314,7 @@ static char* _getcwd_impl(char* path, size_t size) /* path will fit, allocate buffer and copy */ buf = (char*) malloc(size); if (buf != NULL) { - strncpy(buf, unifyfs_cwd, size); + strncpy(buf, posix_client->cwd, size); } else { errno = ENOMEM; } @@ -329,7 +330,7 @@ static char* _getcwd_impl(char* path, size_t size) * that is big enough */ buf = (char*) malloc(len); if (buf != NULL) { - strlcpy(buf, unifyfs_cwd, len); + strlcpy(buf, posix_client->cwd, len); } else { errno = ENOMEM; } @@ -340,7 +341,7 @@ static char* _getcwd_impl(char* path, size_t size) * check that path fits in the caller's buffer */ if (len <= size) { /* current working dir fits, copy and return */ - strncpy(path, unifyfs_cwd, size); + strncpy(path, posix_client->cwd, size); return path; } else { /* user's buffer is too small */ @@ -355,17 +356,17 @@ char* UNIFYFS_WRAP(__getcwd_chk)(char* path, size_t size, size_t buflen) if (unifyfs_initialized) { /* check that we have a string, * return unusual error in case we don't */ - if (unifyfs_cwd == NULL) { + if (posix_client->cwd == NULL) { errno = EACCES; return NULL; } - /* If unifyfs_cwd is in unifyfs space, handle the cwd logic. + /* If posix_client->cwd is in unifyfs space, handle the cwd logic. * Otherwise, call the real getcwd, and if actual cwd does * not match what we expect, throw an error (the user somehow * changed dir without us noticing, so there is a bug here) */ char upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(unifyfs_cwd, upath)) { + if (unifyfs_intercept_path(posix_client->cwd, upath)) { #if 0 /* TODO: what to do here? 
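
_getcwd_impl() above handles the usual getcwd() cases for paths inside the UnifyFS namespace: with a NULL buffer it allocates the result (of the requested size, or just large enough when size is 0), and otherwise it copies into the caller's buffer and fails when the path does not fit. The following is a self-contained version of the standard contract over an arbitrary in-memory string, not a copy of the UnifyFS helper; current_dir stands in for posix_client->cwd.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Return the "current directory" string following the getcwd() contract:
     * - buf == NULL: allocate a result ('size' bytes, or just big enough when
     *   size == 0) that the caller must free
     * - buf != NULL: copy into buf, or fail with ERANGE if it does not fit */
    static char* toy_getcwd(const char* current_dir, char* buf, size_t size)
    {
        size_t len = strlen(current_dir) + 1;

        if (buf == NULL) {
            size_t alloc = (size == 0) ? len : size;
            if (alloc < len) {
                errno = ERANGE;          /* caller asked for too small a buffer */
                return NULL;
            }
            char* out = malloc(alloc);
            if (out == NULL) {
                errno = ENOMEM;
                return NULL;
            }
            memcpy(out, current_dir, len);
            return out;
        }

        if (size < len) {
            errno = ERANGE;              /* caller's buffer is too small */
            return NULL;
        }
        memcpy(buf, current_dir, len);
        return buf;
    }

    int main(void)
    {
        char small[4];
        char* owned = toy_getcwd("/unifyfs/job1", NULL, 0);
        printf("allocated: %s\n", owned ? owned : "(null)");
        free(owned);

        if (toy_getcwd("/unifyfs/job1", small, sizeof(small)) == NULL) {
            perror("toy_getcwd");        /* expected: ERANGE */
        }
        return 0;
    }
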
*/ if (size > buflen) { @@ -383,9 +384,9 @@ char* UNIFYFS_WRAP(__getcwd_chk)(char* path, size_t size, size_t buflen) /* check that current working dir is what we think * it should be as a sanity check */ - if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) { + if (ret != NULL && strcmp(posix_client->cwd, ret) != 0) { LOGERR("Expcted cwd=%s vs actual=%s", - unifyfs_cwd, ret); + posix_client->cwd, ret); } return ret; @@ -405,17 +406,17 @@ char* UNIFYFS_WRAP(getcwd)(char* path, size_t size) /* check that we have a string, * return unusual error in case we don't */ - if (unifyfs_cwd == NULL) { + if (posix_client->cwd == NULL) { errno = EACCES; return NULL; } - /* If unifyfs_cwd is in unifyfs space, handle the cwd logic. + /* If posix_client->cwd is in unifyfs space, handle the cwd logic. * Otherwise, call the real getcwd, and if actual cwd does * not match what we expect, throw an error (the user somehow * changed dir without us noticing, so there is a bug here) */ char upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(unifyfs_cwd, upath)) { + if (unifyfs_intercept_path(posix_client->cwd, upath)) { /* delegate the rest to our common getcwd function */ return _getcwd_impl(path, size); } else { @@ -426,9 +427,9 @@ char* UNIFYFS_WRAP(getcwd)(char* path, size_t size) /* check that current working dir is what we think * it should be as a sanity check */ - if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) { + if (ret != NULL && strcmp(posix_client->cwd, ret) != 0) { LOGERR("Expcted cwd=%s vs actual=%s", - unifyfs_cwd, ret); + posix_client->cwd, ret); } return ret; @@ -447,17 +448,17 @@ char* UNIFYFS_WRAP(getwd)(char* path) if (unifyfs_initialized) { /* check that we have a string, * return unusual error in case we don't */ - if (unifyfs_cwd == NULL) { + if (posix_client->cwd == NULL) { errno = EACCES; return NULL; } - /* If unifyfs_cwd is in unifyfs space, handle the cwd logic. + /* If posix_client->cwd is in unifyfs space, handle the cwd logic. * Otherwise, call the real getwd, and if actual cwd does * not match what we expect, throw an error (the user somehow * changed dir without us noticing, so there is a bug here) */ char upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(unifyfs_cwd, upath)) { + if (unifyfs_intercept_path(posix_client->cwd, upath)) { /* check that we got a valid path */ if (path == NULL) { errno = EINVAL; @@ -466,9 +467,9 @@ char* UNIFYFS_WRAP(getwd)(char* path) /* finally get length of current working dir and check * that it fits in the caller's buffer */ - size_t len = strlen(unifyfs_cwd) + 1; + size_t len = strlen(posix_client->cwd) + 1; if (len <= PATH_MAX) { - strncpy(path, unifyfs_cwd, PATH_MAX); + strncpy(path, posix_client->cwd, PATH_MAX); return path; } else { /* user's buffer is too small */ @@ -483,9 +484,9 @@ char* UNIFYFS_WRAP(getwd)(char* path) /* check that current working dir is what we think * it should be as a sanity check */ - if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) { + if (ret != NULL && strcmp(posix_client->cwd, ret) != 0) { LOGERR("Expcted cwd=%s vs actual=%s", - unifyfs_cwd, ret); + posix_client->cwd, ret); } return ret; @@ -504,20 +505,20 @@ char* UNIFYFS_WRAP(get_current_dir_name)(void) if (unifyfs_initialized) { /* check that we have a string, return unusual error * in case we don't */ - if (unifyfs_cwd == NULL) { + if (posix_client->cwd == NULL) { errno = EACCES; return NULL; } - /* If unifyfs_cwd is in unifyfs space, handle the cwd logic. + /* If posix_client->cwd is in unifyfs space, handle the cwd logic. 
* Otherwise, call real get_current_dir_name, and if actual cwd does * not match what we expect, throw an error (the user somehow * changed dir without us noticing, so there is a bug here) */ char upath[UNIFYFS_MAX_FILENAME]; - if (unifyfs_intercept_path(unifyfs_cwd, upath)) { + if (unifyfs_intercept_path(posix_client->cwd, upath)) { /* supposed to allocate a copy of the current working dir * and return that to caller, to be freed by caller */ - char* ret = strdup(unifyfs_cwd); + char* ret = strdup(posix_client->cwd); if (ret == NULL) { errno = ENOMEM; } @@ -530,9 +531,9 @@ char* UNIFYFS_WRAP(get_current_dir_name)(void) /* check that current working dir is what we think * it should be as a sanity check */ - if (ret != NULL && strcmp(unifyfs_cwd, ret) != 0) { + if (ret != NULL && strcmp(posix_client->cwd, ret) != 0) { LOGERR("Expcted cwd=%s vs actual=%s", - unifyfs_cwd, ret); + posix_client->cwd, ret); } return ret; @@ -562,7 +563,7 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) } /* verify that we really have a file by the old name */ - int fid = unifyfs_get_fid_from_path(old_upath); + int fid = unifyfs_fid_from_path(posix_client, old_upath); if (fid < 0) { /* ERROR: oldname does not exist */ LOGDBG("Couldn't find entry for %s in UNIFYFS", old_upath); @@ -581,7 +582,7 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) /* TODO: rename should replace existing file atomically */ /* verify that we don't already have a file by the new name */ - int newfid = unifyfs_get_fid_from_path(new_upath); + int newfid = unifyfs_fid_from_path(posix_client, new_upath); if (newfid >= 0) { /* something exists in newpath, need to delete it */ int ret = UNIFYFS_WRAP(unlink)(newpath); @@ -594,8 +595,8 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) /* finally overwrite the old name with the new name */ LOGDBG("Changing %s to %s", - (char*)&unifyfs_filelist[fid].filename, new_upath); - strcpy((void*)&unifyfs_filelist[fid].filename, new_upath); + (char*)posix_client->unifyfs_filelist[fid].filename, new_upath); + strcpy((void*)posix_client->unifyfs_filelist[fid].filename, new_upath); /* success */ return 0; @@ -621,10 +622,10 @@ int UNIFYFS_WRAP(truncate)(const char* path, off_t length) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* get file id for path name */ - int fid = unifyfs_get_fid_from_path(upath); + int fid = unifyfs_fid_from_path(posix_client, upath); if (fid >= 0) { /* got the file locally, use fid_truncate the file */ - int rc = unifyfs_fid_truncate(fid, length); + int rc = unifyfs_fid_truncate(posix_client, fid, length); if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; @@ -635,7 +636,7 @@ int UNIFYFS_WRAP(truncate)(const char* path, off_t length) * we could fetch file attribute w/ metaget and check for such * invalid requests to avoid extra rpcs. 
*/ int gfid = unifyfs_generate_gfid(upath); - int rc = invoke_client_truncate_rpc(gfid, length); + int rc = invoke_client_truncate_rpc(posix_client, gfid, length); if (rc != UNIFYFS_SUCCESS) { LOGDBG("truncate rpc failed %s in UNIFYFS", upath); errno = unifyfs_rc_errno(rc); @@ -658,7 +659,7 @@ int UNIFYFS_WRAP(unlink)(const char* path) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with S_IFREG mask */ - int ret = unifyfs_remove(upath, S_IFREG); + int ret = _unifyfs_remove(upath, S_IFREG); if (ret != UNIFYFS_SUCCESS) { LOGDBG("unlink() failed on %s in UNIFYFS", upath); return -1; @@ -679,7 +680,7 @@ int UNIFYFS_WRAP(remove)(const char* path) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with 0 mask */ - int ret = unifyfs_remove(upath, 0); + int ret = _unifyfs_remove(upath, 0); if (ret != UNIFYFS_SUCCESS) { LOGDBG("remove() failed on %s in UNIFYFS", upath); return -1; @@ -698,7 +699,7 @@ int UNIFYFS_WRAP(remove)(const char* path) static int unifyfs_get_meta_with_size(int gfid, unifyfs_file_attr_t* pfattr) { /* lookup global meta data for this file */ - int ret = unifyfs_get_global_file_meta(gfid, pfattr); + int ret = unifyfs_get_global_file_meta(posix_client, gfid, pfattr); if (ret != UNIFYFS_SUCCESS) { LOGDBG("get metadata rpc failed"); return ret; @@ -710,7 +711,7 @@ static int unifyfs_get_meta_with_size(int gfid, unifyfs_file_attr_t* pfattr) if (S_ISREG(pfattr->mode) && !pfattr->is_laminated) { /* lookup current global file size */ size_t filesize; - ret = invoke_client_filesize_rpc(gfid, &filesize); + ret = invoke_client_filesize_rpc(posix_client, gfid, &filesize); if (ret == UNIFYFS_SUCCESS) { /* success, we have a file size value */ pfattr->size = (uint64_t) filesize; @@ -736,9 +737,9 @@ static int __stat(const char* path, struct stat* buf) } /* flush any pending writes if needed */ - int fid = unifyfs_get_fid_from_path(path); + int fid = unifyfs_fid_from_path(posix_client, path); if (fid != -1) { - int sync_rc = unifyfs_fid_sync(fid); + int sync_rc = unifyfs_fid_sync_extents(posix_client, fid); if (sync_rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(sync_rc); return -1; @@ -762,7 +763,7 @@ static int __stat(const char* path, struct stat* buf) /* update local file metadata (if applicable) */ if (fid != -1) { - unifyfs_fid_update_file_meta(fid, &fattr); + unifyfs_fid_update_file_meta(posix_client, fid, &fattr); } /* copy attributes to stat struct */ @@ -792,7 +793,7 @@ int UNIFYFS_WRAP(fstat)(int fd, struct stat* buf) /* check whether we should intercept this file descriptor */ if (unifyfs_intercept_fd(&fd)) { int fid = unifyfs_get_fid_from_fd(fd); - const char* path = unifyfs_path_from_fid(fid); + const char* path = unifyfs_path_from_fid(posix_client, fid); int ret = __stat(path, buf); return ret; } else { @@ -866,7 +867,7 @@ int UNIFYFS_WRAP(__fxstat)(int vers, int fd, struct stat* buf) } int fid = unifyfs_get_fid_from_fd(fd); - const char* path = unifyfs_path_from_fid(fid); + const char* path = unifyfs_path_from_fid(posix_client, fid); int ret = __stat(path, buf); return ret; } else { @@ -891,7 +892,7 @@ static int unifyfs_statfs(struct statfs* fsbuf) memset(fsbuf, 0, sizeof(*fsbuf)); /* set file system type */ - if (unifyfs_super_magic) { + if (posix_client->use_unifyfs_magic) { /* return a magic value that is specific to UnifyFS */ fsbuf->f_type = UNIFYFS_SUPER_MAGIC; } else { @@ -904,7 +905,7 @@ static int unifyfs_statfs(struct statfs* fsbuf) 
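
The unifyfs_statfs() hunk in this region reports either a UnifyFS-specific magic value or a tmpfs one depending on the new posix_client->use_unifyfs_magic flag, and fills in the fields it can answer, such as f_files from the client's file limit and f_namelen from UNIFYFS_MAX_FILENAME. A standalone Linux illustration of populating struct statfs along the same lines; the EXAMPLE_* constants are placeholders for this sketch, not UnifyFS values.

    #include <stdio.h>
    #include <string.h>
    #include <sys/vfs.h>        /* Linux struct statfs */

    /* placeholder constants for the illustration only */
    #define EXAMPLE_FS_MAGIC   0x12345678UL
    #define EXAMPLE_MAX_FILES  1024
    #define EXAMPLE_MAX_NAME   128

    static int fill_statfs(struct statfs* fsbuf, int use_custom_magic)
    {
        memset(fsbuf, 0, sizeof(*fsbuf));

        /* report either a filesystem-specific magic value or tmpfs's value */
        fsbuf->f_type = use_custom_magic ? EXAMPLE_FS_MAGIC
                                         : 0x01021994; /* TMPFS_MAGIC */

        fsbuf->f_bsize = 4096;                 /* optimal transfer block size */
        fsbuf->f_files = EXAMPLE_MAX_FILES;    /* total file nodes */
        fsbuf->f_namelen = EXAMPLE_MAX_NAME;   /* max filename length */
        return 0;
    }

    int main(void)
    {
        struct statfs sb;
        fill_statfs(&sb, 1);
        printf("f_type=0x%lx f_files=%lu f_namelen=%lu\n",
               (unsigned long)sb.f_type, (unsigned long)sb.f_files,
               (unsigned long)sb.f_namelen);
        return 0;
    }
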
//fsbuf->f_blocks = ??; /* Total data blocks in filesystem */ //fsbuf->f_bfree = ??; /* Free blocks in filesystem */ //fsbuf->f_bavail = ??; /* Free blocks available */ - fsbuf->f_files = unifyfs_max_files; /* Total file nodes */ + fsbuf->f_files = posix_client->max_files; /* Total file nodes */ //fsbuf->f_ffree = ??; /* Free file nodes in filesystem */ fsbuf->f_namelen = UNIFYFS_MAX_FILENAME; /* Max filename length */ return 0; @@ -978,7 +979,7 @@ int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, size_t* nread) } /* it's an error to read from a directory */ - if (unifyfs_fid_is_dir(fid)) { + if (unifyfs_fid_is_dir(posix_client, fid)) { /* TODO: note that read/pread can return this, but not fread */ return EISDIR; } @@ -1003,11 +1004,11 @@ int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, size_t* nread) /* TODO: handle error if sync fails? */ /* sync data for file before reading, if needed */ - unifyfs_fid_sync(fid); + unifyfs_fid_sync_extents(posix_client, fid); /* fill in read request */ read_req_t req; - req.gfid = unifyfs_gfid_from_fid(fid); + req.gfid = unifyfs_gfid_from_fid(posix_client, fid); req.offset = (size_t) pos; req.length = count; req.nread = 0; @@ -1018,7 +1019,7 @@ int unifyfs_fd_read(int fd, off_t pos, void* buf, size_t count, size_t* nread) req.cover_end_offset = (size_t)-1; /* execute read operation */ - int ret = process_gfid_reads(&req, 1); + int ret = process_gfid_reads(posix_client, &req, 1); if (ret != UNIFYFS_SUCCESS) { /* failed to issue read operation */ return ret; @@ -1053,7 +1054,7 @@ int unifyfs_fd_write(int fd, off_t pos, const void* buf, size_t count, } /* it's an error to write to a directory */ - if (unifyfs_fid_is_dir(fid)) { + if (unifyfs_fid_is_dir(posix_client, fid)) { return EINVAL; } @@ -1071,11 +1072,12 @@ int unifyfs_fd_write(int fd, off_t pos, const void* buf, size_t count, } /* finally write specified data to file */ - int write_rc = unifyfs_fid_write(fid, pos, buf, count, nwritten); + int write_rc = unifyfs_fid_write(posix_client, fid, pos, + buf, count, nwritten); return write_rc; } -static int unifyfs_create(char* upath, mode_t mode) +static int posix_create(char* upath, mode_t mode) { /* equivalent to open(path, O_WRONLY|O_CREAT|O_TRUNC, mode) */ @@ -1083,14 +1085,14 @@ static int unifyfs_create(char* upath, mode_t mode) int fid; int flags = O_WRONLY | O_CREAT | O_TRUNC; off_t pos; - int rc = unifyfs_fid_open(upath, flags, mode, &fid, &pos); + int rc = unifyfs_fid_open(posix_client, upath, flags, mode, &fid, &pos); if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; } /* allocate a free file descriptor value */ - int fd = unifyfs_stack_pop(unifyfs_fd_stack); + int fd = unifyfs_stack_pop(posix_fd_stack); if (fd < 0) { /* ran out of file descriptors */ errno = EMFILE; @@ -1118,7 +1120,7 @@ int UNIFYFS_WRAP(creat)(const char* path, mode_t mode) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* TODO: handle relative paths using current working directory */ - return unifyfs_create(upath, mode); + return posix_create(upath, mode); } else { MAP_OR_FAIL(creat); int ret = UNIFYFS_REAL(creat)(path, mode); @@ -1132,7 +1134,7 @@ int UNIFYFS_WRAP(creat64)(const char* path, mode_t mode) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* TODO: handle relative paths using current working directory */ - return unifyfs_create(upath, mode); + return posix_create(upath, mode); } else { MAP_OR_FAIL(creat64); int ret = UNIFYFS_REAL(creat64)(path, mode); @@ 
-1160,14 +1162,15 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) /* create the file */ int fid; off_t pos; - int rc = unifyfs_fid_open(upath, flags, mode, &fid, &pos); + int rc = unifyfs_fid_open(posix_client, upath, flags, mode, + &fid, &pos); if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; } /* allocate a free file descriptor value */ - int fd = unifyfs_stack_pop(unifyfs_fd_stack); + int fd = unifyfs_stack_pop(posix_fd_stack); if (fd < 0) { /* ran out of file descriptors */ errno = EMFILE; @@ -1288,7 +1291,8 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) } /* lookup meta to get file size */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(posix_client, + fid); if (meta == NULL) { /* bad file descriptor */ errno = EBADF; @@ -1324,7 +1328,7 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) break; case SEEK_END: /* seek to EOF + offset */ - logical_eof = unifyfs_fid_logical_size(fid); + logical_eof = unifyfs_fid_logical_size(posix_client, fid); if (logical_eof + offset < 0) { /* offset is negative and will result in negative position */ errno = EINVAL; @@ -1334,7 +1338,7 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) break; case SEEK_DATA: /* Using fallback approach: always return offset */ - logical_eof = unifyfs_fid_logical_size(fid); + logical_eof = unifyfs_fid_logical_size(posix_client, fid); if (offset < 0 || offset > logical_eof) { /* negative offset and offset beyond EOF are invalid */ errno = ENXIO; @@ -1344,7 +1348,7 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) break; case SEEK_HOLE: /* Using fallback approach: always return offset for EOF */ - logical_eof = unifyfs_fid_logical_size(fid); + logical_eof = unifyfs_fid_logical_size(posix_client, fid); if (offset < 0 || offset > logical_eof) { /* negative offset and offset beyond EOF are invalid */ errno = ENXIO; @@ -1508,7 +1512,7 @@ ssize_t UNIFYFS_WRAP(write)(int fd, const void* buf, size_t count) * file position. */ int fid = unifyfs_get_fid_from_fd(fd); - pos = unifyfs_fid_logical_size(fid); + pos = unifyfs_fid_logical_size(posix_client, fid); } /* write data to file */ @@ -1638,10 +1642,11 @@ int UNIFYFS_WRAP(lio_listio)(int mode, struct aiocb* const aiocb_list[], } else { /* TODO: handle error if sync fails? */ /* sync data for file before reading, if needed */ - unifyfs_fid_sync(fid); + unifyfs_fid_sync_extents(posix_client, fid); /* define read request for this file */ - reqs[reqcnt].gfid = unifyfs_gfid_from_fid(fid); + reqs[reqcnt].gfid = unifyfs_gfid_from_fid(posix_client, + fid); reqs[reqcnt].offset = (size_t)(cbp->aio_offset); reqs[reqcnt].length = cbp->aio_nbytes; reqs[reqcnt].nread = 0; @@ -1673,7 +1678,7 @@ int UNIFYFS_WRAP(lio_listio)(int mode, struct aiocb* const aiocb_list[], } if (reqcnt) { - rc = process_gfid_reads(reqs, reqcnt); + rc = process_gfid_reads(posix_client, reqs, reqcnt); if (rc != UNIFYFS_SUCCESS) { /* error reading data */ ret = rc; @@ -1722,11 +1727,11 @@ ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) /* TODO: handle error if sync fails? 
*/ /* sync data for file before reading, if needed */ - unifyfs_fid_sync(fid); + unifyfs_fid_sync_extents(posix_client, fid); /* fill in read request */ read_req_t req; - req.gfid = unifyfs_gfid_from_fid(fid); + req.gfid = unifyfs_gfid_from_fid(posix_client, fid); req.offset = offset; req.length = count; req.nread = 0; @@ -1738,7 +1743,7 @@ ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) /* execute read operation */ ssize_t retcount; - int ret = process_gfid_reads(&req, 1); + int ret = process_gfid_reads(posix_client, &req, 1); if (ret != UNIFYFS_SUCCESS) { /* error reading data */ errno = unifyfs_rc_errno(ret); @@ -1834,16 +1839,16 @@ int UNIFYFS_WRAP(fchdir)(int fd) } /* lookup path for fd */ - const char* path = unifyfs_path_from_fid(fid); + const char* path = unifyfs_path_from_fid(posix_client, fid); /* TODO: test that path is not a file? */ /* we're happy to change into any directory in unifyfs * should we check that we don't change into a file at least? */ - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); + if (posix_client->cwd != NULL) { + free(posix_client->cwd); } - unifyfs_cwd = strdup(path); + posix_client->cwd = strdup(path); return 0; } else { MAP_OR_FAIL(fchdir); @@ -1852,8 +1857,8 @@ int UNIFYFS_WRAP(fchdir)(int fd) /* if the change dir was successful, * update our current working directory */ if (unifyfs_initialized && ret == 0) { - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); + if (posix_client->cwd != NULL) { + free(posix_client->cwd); } /* if we did a real chdir, let's use a real getcwd @@ -1861,19 +1866,19 @@ int UNIFYFS_WRAP(fchdir)(int fd) MAP_OR_FAIL(getcwd); char* cwd = UNIFYFS_REAL(getcwd)(NULL, 0); if (cwd != NULL) { - unifyfs_cwd = cwd; + posix_client->cwd = cwd; - /* parts of the code may assume unifyfs_cwd is a max size */ + /* some code may assume posix_client->cwd is a max size */ size_t len = strlen(cwd) + 1; if (len > UNIFYFS_MAX_FILENAME) { LOGERR("Current working dir longer (%lu bytes) " - "than UNIFYFS_MAX_FILENAME=%d", - (unsigned long) len, UNIFYFS_MAX_FILENAME); + "than UNIFYFS_MAX_FILENAME=%d", + (unsigned long) len, UNIFYFS_MAX_FILENAME); } } else { /* ERROR */ LOGERR("Failed to getcwd after fchdir(%d) errno=%d %s", - fd, errno, strerror(errno)); + fd, errno, strerror(errno)); } } @@ -1901,7 +1906,7 @@ int UNIFYFS_WRAP(ftruncate)(int fd, off_t length) } /* truncate the file */ - int rc = unifyfs_fid_truncate(fid, length); + int rc = unifyfs_fid_truncate(posix_client, fid, length); if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; @@ -1928,7 +1933,7 @@ int UNIFYFS_WRAP(fsync)(int fd) } /* invoke fsync rpc to register index metadata with server */ - int ret = unifyfs_fid_sync(fid); + int ret = unifyfs_fid_sync_extents(posix_client, fid); if (ret != UNIFYFS_SUCCESS) { /* sync failed for some reason, set errno and return error */ errno = unifyfs_rc_errno(ret); @@ -1965,37 +1970,7 @@ int UNIFYFS_WRAP(flock)(int fd, int operation) /* check whether we should intercept this file descriptor */ if (unifyfs_intercept_fd(&fd)) { - // KMM I removed the locking code because it was causing - // hangs - /* - -- currently handling the blocking variants only - switch (operation) - { - case LOCK_EX: - LOGDBG("locking file %d", fid); - ret = pthread_spin_lock(&meta->fspinlock); - if ( ret ) { - perror("pthread_spin_lock() failed"); - return -1; - } - meta->flock_status = EX_LOCKED; - break; - case LOCK_SH: - -- not needed for CR; will not be supported, - -- update flock_status anyway - meta->flock_status = SH_LOCKED; - 
break; - case LOCK_UN: - ret = pthread_spin_unlock(&meta->fspinlock); - LOGDBG("unlocking file %d", fid); - meta->flock_status = UNLOCKED; - break; - default: - errno = EINVAL; - return -1; - } - */ - + // KMM I removed the locking code because it was causing hangs return 0; } else { MAP_OR_FAIL(flock); @@ -2141,7 +2116,7 @@ int UNIFYFS_WRAP(close)(int fd) /* if file was opened for writing, sync it */ if (filedesc->write) { - int sync_rc = unifyfs_fid_sync(fid); + int sync_rc = unifyfs_fid_sync_extents(posix_client, fid); if (sync_rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(sync_rc); return -1; @@ -2149,7 +2124,7 @@ int UNIFYFS_WRAP(close)(int fd) } /* close the file id */ - int close_rc = unifyfs_fid_close(fid); + int close_rc = unifyfs_fid_close(posix_client, fid); if (close_rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(close_rc); return -1; @@ -2161,7 +2136,7 @@ int UNIFYFS_WRAP(close)(int fd) unifyfs_fd_init(fd); /* add file descriptor back to free stack */ - unifyfs_stack_push(unifyfs_fd_stack, fd); + unifyfs_stack_push(posix_fd_stack, fd); return 0; } else { @@ -2177,10 +2152,10 @@ static int __chmod(int fid, mode_t mode) int ret; /* get path for printing debug messages */ - const char* path = unifyfs_path_from_fid(fid); + const char* path = unifyfs_path_from_fid(posix_client, fid); /* lookup metadata for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(posix_client, fid); if (NULL == meta) { LOGDBG("no metadata info for %s", path); errno = ENOENT; @@ -2196,7 +2171,7 @@ static int __chmod(int fid, mode_t mode) /* found file, and it's not yet laminated, * get the global file id */ - int gfid = unifyfs_gfid_from_fid(fid); + int gfid = unifyfs_gfid_from_fid(posix_client, fid); /* * If the chmod clears all the existing write bits, then it's a laminate. @@ -2207,7 +2182,7 @@ static int __chmod(int fid, mode_t mode) if ((meta->attrs.mode & 0222) && (((meta->attrs.mode & 0222) & mode) == 0)) { /* We're laminating. 
*/ - ret = invoke_client_laminate_rpc(gfid); + ret = invoke_client_laminate_rpc(posix_client, gfid); if (ret) { LOGERR("laminate failed"); errno = unifyfs_rc_errno(ret); @@ -2221,7 +2196,7 @@ static int __chmod(int fid, mode_t mode) /* update the global meta data to reflect new permissions */ unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CHMOD; - ret = unifyfs_set_global_file_meta_from_fid(fid, op); + ret = unifyfs_set_global_file_meta_from_fid(posix_client, fid, op); if (ret) { LOGERR("chmod: can't set global meta entry for %s (fid:%d)", path, fid); @@ -2231,7 +2206,7 @@ static int __chmod(int fid, mode_t mode) /* read metadata back to pick up file size and laminated flag */ unifyfs_file_attr_t attr = {0}; - ret = unifyfs_get_global_file_meta(gfid, &attr); + ret = unifyfs_get_global_file_meta(posix_client, gfid, &attr); if (ret) { LOGERR("chmod: can't get global meta entry for %s (fid:%d)", path, fid); @@ -2243,7 +2218,7 @@ static int __chmod(int fid, mode_t mode) debug_print_file_attr(&attr); /* update global size of file from global metadata */ - unifyfs_fid_update_file_meta(fid, &attr); + unifyfs_fid_update_file_meta(posix_client, fid, &attr); return 0; } @@ -2276,7 +2251,7 @@ int UNIFYFS_WRAP(chmod)(const char* path, mode_t mode) char upath[UNIFYFS_MAX_FILENAME]; if (unifyfs_intercept_path(path, upath)) { /* check if path exists */ - int fid = unifyfs_get_fid_from_path(upath); + int fid = unifyfs_fid_from_path(posix_client, upath); if (fid < 0) { LOGDBG("chmod: unifyfs_get_id_from path failed, returning -1, %s", upath); diff --git a/client/src/unifyfs-sysio.h b/client/src/unifyfs-sysio.h index 8f7ac4f6e..1a9afa8e6 100644 --- a/client/src/unifyfs-sysio.h +++ b/client/src/unifyfs-sysio.h @@ -44,6 +44,9 @@ #define UNIFYFS_SYSIO_H #include "unifyfs-internal.h" +#include "unifyfs_wrap.h" + +#include "unifyfs-dirops.h" #define AIOCB_ERROR_CODE(cbp) (cbp->__error_code) #define AIOCB_RETURN_VAL(cbp) (cbp->__return_value) @@ -140,6 +143,4 @@ int unifyfs_fd_write( size_t* nwritten /* number of bytes written */ ); -#include "unifyfs-dirops.h" - #endif /* UNIFYFS_SYSIO_H */ diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 1c64b695b..070d85650 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -42,7 +42,7 @@ #include "unifyfs.h" #include "unifyfs-internal.h" -#include "unifyfs-fixed.h" +#include "unifyfs_fid.h" #include "client_read.h" // client-server rpc headers @@ -54,1392 +54,101 @@ #include "spath.h" #endif /* USE_SPATH */ -/* avoid duplicate mounts (for now) */ -int unifyfs_mounted = -1; +/* list of local clients */ +static arraylist_t* client_list; /* = NULL */ -/* whether we can use fgetpos/fsetpos */ -static int unifyfs_fpos_enabled = 1; -static unifyfs_cfg_t client_cfg; - -unifyfs_index_buf_t unifyfs_indices; -static size_t unifyfs_index_buf_size; /* size of metadata log */ -unsigned long unifyfs_max_index_entries; /* max metadata log entries */ - -int global_rank_cnt; /* count of world ranks */ -int client_rank; /* client-provided rank (for debugging) */ - -int unifyfs_app_id; /* application (aka mountpoint) id */ -int unifyfs_client_id; /* client id within application */ - -static int unifyfs_use_single_shm = 0; -static int unifyfs_page_size = 0; - -/* Determine whether we automatically sync every write to server. - * This slows write performance, but it can serve as a work - * around for apps that do not have all necessary syncs. 
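The __chmod() change above keeps the existing UnifyFS lamination trigger: when a mode change clears every remaining write bit, the client invokes the laminate RPC and then refreshes its cached attributes from the server. A hedged application-level sketch (illustrative only, not part of this patch):

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

/* Write a file, then laminate it by dropping all write permission bits.
 * Per __chmod() above, going from 0644 to 0444 clears the write bits
 * and triggers the laminate RPC. */
int write_then_laminate(const char* path, const void* buf, size_t len)
{
    int fd = open(path, O_CREAT | O_WRONLY, 0644);
    if (fd < 0) {
        return -1;
    }
    if (write(fd, buf, len) < 0) {
        close(fd);
        return -1;
    }
    close(fd);                 /* close() syncs extents for writable fds */
    return chmod(path, 0444);  /* clearing all write bits laminates */
}

Once laminated, the open path later in this diff refuses any open that could write to the file, returning EROFS.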
*/ -static bool unifyfs_write_sync; - -static off_t unifyfs_max_offt; -static off_t unifyfs_min_offt; -static off_t unifyfs_max_long; -static off_t unifyfs_min_long; - -/* TODO: moved these to fixed file */ -int unifyfs_max_files; /* maximum number of files to store */ -bool unifyfs_local_extents; /* track data extents in client to read local */ - -/* whether to return UNIFYFS (true) or TMPFS (false) magic value from statfs */ -bool unifyfs_super_magic; - -/* log-based I/O context */ -logio_context* logio_ctx; - -/* keep track of what we've initialized */ -int unifyfs_initialized = 0; - -/* superblock - persistent shared memory region (metadata + data) */ -static shm_context* shm_super_ctx; - -/* per-file metadata */ -static void* free_fid_stack; -unifyfs_filename_t* unifyfs_filelist; -static unifyfs_filemeta_t* unifyfs_filemetas; - -/* TODO: metadata spillover is not currently supported */ -int unifyfs_spillmetablock = -1; - -/* array of file descriptors */ -unifyfs_fd_t unifyfs_fds[UNIFYFS_CLIENT_MAX_FILEDESCS]; -rlim_t unifyfs_fd_limit; - -/* array of file streams */ -unifyfs_stream_t unifyfs_streams[UNIFYFS_CLIENT_MAX_FILEDESCS]; - -/* - * TODO: the number of open directories clearly won't exceed the number of - * file descriptors. however, the current MAX_FILEDESCS value of 256 will - * quickly run out. if this value is fixed to be reasonably larger, then we - * would need a way to dynamically allocate the dirstreams instead of the - * following fixed size array. - */ - -/* array of DIR* streams to be used */ -unifyfs_dirstream_t unifyfs_dirstreams[UNIFYFS_CLIENT_MAX_FILEDESCS]; - -/* stack to track free file descriptor values, - * each is an index into unifyfs_fds array */ -void* unifyfs_fd_stack; - -/* stack to track free file streams, - * each is an index into unifyfs_streams array */ -void* unifyfs_stream_stack; - -/* stack to track free directory streams, - * each is an index into unifyfs_dirstreams array */ -void* unifyfs_dirstream_stack; - -/* mount point information */ -char* unifyfs_mount_prefix; -size_t unifyfs_mount_prefixlen = 0; - -/* to track current working directory */ -char* unifyfs_cwd; - -/* mutex to lock stack operations */ -pthread_mutex_t unifyfs_stack_mutex = PTHREAD_MUTEX_INITIALIZER; - -/* single function to route all unsupported wrapper calls through */ -int unifyfs_vunsupported( - const char* fn_name, - const char* file, - int line, - const char* fmt, - va_list args) -{ - /* print a message about where in the UNIFYFS code we are */ - printf("UNIFYFS UNSUPPORTED: %s() at %s:%d: ", fn_name, file, line); - - /* print string with more info about call, e.g., param values */ - va_list args2; - va_copy(args2, args); - vprintf(fmt, args2); - va_end(args2); - - /* TODO: optionally abort */ - - return UNIFYFS_SUCCESS; -} - -/* single function to route all unsupported wrapper calls through */ -int unifyfs_unsupported( - const char* fn_name, - const char* file, - int line, - const char* fmt, - ...) 
-{ - /* print string with more info about call, e.g., param values */ - va_list args; - va_start(args, fmt); - int rc = unifyfs_vunsupported(fn_name, file, line, fmt, args); - va_end(args); - return rc; -} - -/* returns 1 if two input parameters will overflow their type when - * added together */ -inline int unifyfs_would_overflow_offt(off_t a, off_t b) -{ - /* if both parameters are positive, they could overflow when - * added together */ - if (a > 0 && b > 0) { - /* if the distance between a and max is greater than or equal to - * b, then we could add a and b and still not exceed max */ - if (unifyfs_max_offt - a >= b) { - return 0; - } - return 1; - } - - /* if both parameters are negative, they could underflow when - * added together */ - if (a < 0 && b < 0) { - /* if the distance between min and a is less than or equal to - * b, then we could add a and b and still not exceed min */ - if (unifyfs_min_offt - a <= b) { - return 0; - } - return 1; - } - - /* if a and b are mixed signs or at least one of them is 0, - * then adding them together will produce a result closer to 0 - * or at least no further away than either value already is */ - return 0; -} - -/* returns 1 if two input parameters will overflow their type when - * added together */ -inline int unifyfs_would_overflow_long(long a, long b) -{ - /* if both parameters are positive, they could overflow when - * added together */ - if (a > 0 && b > 0) { - /* if the distance between a and max is greater than or equal to - * b, then we could add a and b and still not exceed max */ - if (unifyfs_max_long - a >= b) { - return 0; - } - return 1; - } - - /* if both parameters are negative, they could underflow when - * added together */ - if (a < 0 && b < 0) { - /* if the distance between min and a is less than or equal to - * b, then we could add a and b and still not exceed min */ - if (unifyfs_min_long - a <= b) { - return 0; - } - return 1; - } - - /* if a and b are mixed signs or at least one of them is 0, - * then adding them together will produce a result closer to 0 - * or at least no further away than either value already is */ - return 0; -} - -/* lock access to shared data structures in superblock */ -inline int unifyfs_stack_lock(void) -{ - if (unifyfs_use_single_shm) { - return pthread_mutex_lock(&unifyfs_stack_mutex); - } - return 0; -} - -/* unlock access to shared data structures in superblock */ -inline int unifyfs_stack_unlock(void) -{ - if (unifyfs_use_single_shm) { - return pthread_mutex_unlock(&unifyfs_stack_mutex); - } - return 0; -} - -static void unifyfs_normalize_path(const char* path, char* normalized) -{ - /* if we have a relative path, prepend the current working directory */ - if (path[0] != '/' && unifyfs_cwd != NULL) { - /* got a relative path, add our cwd */ - snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s/%s", unifyfs_cwd, path); - } else { - snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", path); - } - -#ifdef USE_SPATH - /* normalize path to handle '.', '..', - * and extra or trailing '/' characters */ - char* str = spath_strdup_reduce_str(normalized); - snprintf(normalized, UNIFYFS_MAX_FILENAME, "%s", str); - free(str); -#endif /* USE_SPATH */ -} - -/* Given a path, which may relative or absolute, - * return 1 if we should intercept the path, 0 otherwise. - * If path is to be intercepted, returned a normalized version in upath. 
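unifyfs_normalize_path() above resolves a relative path against the client's working directory (and, with spath, reduces '.', '..', and duplicate slashes); unifyfs_intercept_path() just below then prefix-matches the result against the mount point, additionally requiring the character after the prefix to be '/'. A small standalone sketch of that prefix rule (illustrative only, not part of the patch):

#include <string.h>

/* Return 1 if an already-normalized, absolute 'path' falls under
 * 'prefix'. Mirrors the test below: "/unifyfs/a.txt" matches the
 * prefix "/unifyfs", but "/unifyfsx/a.txt" does not, because the
 * character following the prefix must be '/' (or the path must be
 * exactly the prefix itself). */
int under_mount_point(const char* path, const char* prefix)
{
    size_t plen = strlen(prefix);
    if (strncmp(path, prefix, plen) != 0) {
        return 0;
    }
    return (path[plen] == '\0') || (path[plen] == '/');
}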
*/ -inline int unifyfs_intercept_path(const char* path, char* upath) -{ - /* don't intercept anything until we're initialized */ - if (!unifyfs_initialized) { - return 0; - } - - /* if we have a relative path, prepend the current working directory */ - char target[UNIFYFS_MAX_FILENAME]; - unifyfs_normalize_path(path, target); - - /* if the path starts with our mount point, intercept it */ - int intercept = 0; - if (strncmp(target, unifyfs_mount_prefix, unifyfs_mount_prefixlen) == 0) { - /* characters in target up through mount point match, - * assume we match */ - intercept = 1; - - /* if we have another character, it must be '/' */ - if (strlen(target) > unifyfs_mount_prefixlen && - target[unifyfs_mount_prefixlen] != '/') { - intercept = 0; - } - } - - /* copy normalized path into upath */ - if (intercept) { - strncpy(upath, target, UNIFYFS_MAX_FILENAME); - } - - return intercept; -} - -/* given an fd, return 1 if we should intercept this file, 0 otherwise, - * convert fd to new fd value if needed */ -inline int unifyfs_intercept_fd(int* fd) -{ - int oldfd = *fd; - - /* don't intercept anything until we're initialized */ - if (!unifyfs_initialized) { - return 0; - } - - if (oldfd < unifyfs_fd_limit) { - /* this fd is a real system fd, so leave it as is */ - return 0; - } else if (oldfd < 0) { - /* this is an invalid fd, so we should not intercept it */ - return 0; - } else { - /* this is an fd we generated and returned to the user, - * so intercept the call and shift the fd */ - int newfd = oldfd - unifyfs_fd_limit; - *fd = newfd; - LOGDBG("Changing fd from exposed %d to internal %d", oldfd, newfd); - return 1; - } -} - -/* given a file stream, return 1 if we should intercept this file, - * 0 otherwise */ -inline int unifyfs_intercept_stream(FILE* stream) -{ - /* don't intercept anything until we're initialized */ - if (!unifyfs_initialized) { - return 0; - } - - /* check whether this pointer lies within range of our - * file stream array */ - unifyfs_stream_t* ptr = (unifyfs_stream_t*) stream; - unifyfs_stream_t* start = unifyfs_streams; - unifyfs_stream_t* end = start + UNIFYFS_CLIENT_MAX_FILEDESCS; - if (ptr >= start && ptr < end) { - return 1; - } - - return 0; -} - -/* given an directory stream, return 1 if we should intercept this - * direcotry, 0 otherwise */ -inline int unifyfs_intercept_dirstream(DIR* dirp) -{ - /* don't intercept anything until we're initialized */ - if (!unifyfs_initialized) { - return 0; - } - - /* check whether this pointer lies within range of our - * directory stream array */ - - unifyfs_dirstream_t* ptr = (unifyfs_dirstream_t*) dirp; - unifyfs_dirstream_t* start = unifyfs_dirstreams; - unifyfs_dirstream_t* end = start + UNIFYFS_CLIENT_MAX_FILEDESCS; - if (ptr >= start && ptr < end) { - return 1; - } - - return 0; -} - -/* given a path, return the local file id, or -1 if not found */ -inline int unifyfs_get_fid_from_path(const char* path) -{ - /* scan through active entries in filelist array looking - * for a match of path */ - int i = 0; - while (i < unifyfs_max_files) { - if (unifyfs_filelist[i].in_use && - strcmp((void*)&unifyfs_filelist[i].filename, path) == 0) { - LOGDBG("File found: unifyfs_filelist[%d].filename = %s", - i, (char*)&unifyfs_filelist[i].filename); - return i; - } - i++; - } - - /* couldn't find specified path */ - return -1; -} - -/* initialize file descriptor structure for given fd value */ -int unifyfs_fd_init(int fd) -{ - /* get pointer to file descriptor struct for this fd value */ - unifyfs_fd_t* filedesc = &(unifyfs_fds[fd]); - - 
/* set fid to -1 to indicate fd is not active, - * set file position to max value, - * disable read and write flags */ - filedesc->fid = -1; - filedesc->pos = (off_t) -1; - filedesc->read = 0; - filedesc->write = 0; - - return UNIFYFS_SUCCESS; -} - -/* initialize file streams structure for given sid value */ -int unifyfs_stream_init(int sid) -{ - /* get pointer to file stream struct for this id value */ - unifyfs_stream_t* s = &(unifyfs_streams[sid]); - - /* record our id so when given a pointer to the stream - * struct we can easily recover our id value */ - s->sid = sid; - - /* set fd to -1 to indicate stream is not active */ - s->fd = -1; - - return UNIFYFS_SUCCESS; -} - -/* initialize directory streams structure for given dirid value */ -int unifyfs_dirstream_init(int dirid) -{ - /* get pointer to directory stream struct for this id value */ - unifyfs_dirstream_t* dirp = &(unifyfs_dirstreams[dirid]); - - /* initialize fields in structure */ - memset((void*) dirp, 0, sizeof(*dirp)); - - /* record our id so when given a pointer to the stream - * struct we can easily recover our id value */ - dirp->dirid = dirid; - - /* set fid to -1 to indicate stream is not active */ - dirp->fid = -1; - - return UNIFYFS_SUCCESS; -} - -/* given a file descriptor, return the file id */ -inline int unifyfs_get_fid_from_fd(int fd) -{ - /* check that file descriptor is within range */ - if (fd < 0 || fd >= UNIFYFS_CLIENT_MAX_FILEDESCS) { - return -1; - } - - /* get local file id that file descriptor is assocated with, - * will be -1 if not active */ - int fid = unifyfs_fds[fd].fid; - return fid; -} - -/* return address of file descriptor structure or NULL if fd is out - * of range */ -inline unifyfs_fd_t* unifyfs_get_filedesc_from_fd(int fd) -{ - if (fd >= 0 && fd < UNIFYFS_CLIENT_MAX_FILEDESCS) { - unifyfs_fd_t* filedesc = &(unifyfs_fds[fd]); - return filedesc; - } - return NULL; -} - -/* given a file id, return a pointer to the meta data, - * otherwise return NULL */ -unifyfs_filemeta_t* unifyfs_get_meta_from_fid(int fid) -{ - /* check that the file id is within range of our array */ - if (fid >= 0 && fid < unifyfs_max_files) { - /* get a pointer to the file meta data structure */ - unifyfs_filemeta_t* meta = &unifyfs_filemetas[fid]; - return meta; - } - return NULL; -} - -/* given a file id, return 1 if file is laminated, 0 otherwise */ -int unifyfs_fid_is_laminated(int fid) -{ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((meta != NULL) && (meta->fid == fid)) { - return meta->attrs.is_laminated; - } - return 0; -} - -/* given a file descriptor, return 1 if file is laminated, - * and 0 otherwise */ -int unifyfs_fd_is_laminated(int fd) -{ - int fid = unifyfs_get_fid_from_fd(fd); - int laminated = unifyfs_fid_is_laminated(fid); - return laminated; -} - -/* --------------------------------------- - * Operations on file storage - * --------------------------------------- */ - -/* allocate and initialize data management resource for file */ -static int fid_store_alloc(int fid) -{ - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((meta != NULL) && (meta->fid == fid)) { - /* Initialize our segment tree that will record our writes */ - int rc = seg_tree_init(&meta->extents_sync); - if (rc != 0) { - return UNIFYFS_FAILURE; - } - - /* Initialize our segment tree to track extents for all writes - * by this process, can be used to read back local data */ - if (unifyfs_local_extents) { - rc = seg_tree_init(&meta->extents); - if (rc != 0) { - /* 
free off extents_sync tree we initialized */ - seg_tree_destroy(&meta->extents_sync); - return UNIFYFS_FAILURE; - } - } - - /* indicate that we're using LOGIO to store data for this file */ - meta->storage = FILE_STORAGE_LOGIO; - - return UNIFYFS_SUCCESS; - } else { - LOGERR("failed to get filemeta for fid=%d", fid); - } - - return UNIFYFS_FAILURE; -} - -/* free data management resource for file */ -static int fid_storage_free(int fid) -{ - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((meta != NULL) && (meta->fid == fid)) { - if (meta->storage == FILE_STORAGE_LOGIO) { - /* Free our write seg_tree */ - seg_tree_destroy(&meta->extents_sync); - - /* Free our extent seg_tree */ - if (unifyfs_local_extents) { - seg_tree_destroy(&meta->extents); - } - } - - /* set storage type back to NULL */ - meta->storage = FILE_STORAGE_NULL; - - return UNIFYFS_SUCCESS; - } - - return UNIFYFS_FAILURE; -} - -/* ======================================= - * Operations on file ids - * ======================================= */ - -/* checks to see if fid is a directory - * returns 1 for yes - * returns 0 for no */ -int unifyfs_fid_is_dir(int fid) -{ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if ((meta != NULL) && (meta->attrs.mode & S_IFDIR)) { - return 1; - } - return 0; -} - -int unifyfs_gfid_from_fid(const int fid) -{ - /* check that local file id is in range */ - if (fid < 0 || fid >= unifyfs_max_files) { - return -1; - } - - /* return global file id, cached in file meta struct */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta != NULL) { - return meta->attrs.gfid; - } else { - return -1; - } -} - -/* scan list of files and return fid corresponding to target gfid, - * returns -1 if not found */ -int unifyfs_fid_from_gfid(int gfid) -{ - int i; - for (i = 0; i < unifyfs_max_files; i++) { - if (unifyfs_filelist[i].in_use && - unifyfs_filemetas[i].attrs.gfid == gfid) { - /* found a file id that's in use and it matches - * the target fid, this is the one */ - return i; - } - } - return -1; -} - -/* Given a fid, return the path. */ -const char* unifyfs_path_from_fid(int fid) -{ - unifyfs_filename_t* fname = &unifyfs_filelist[fid]; - if (fname->in_use) { - return fname->filename; - } - return NULL; -} - -/* checks to see if a directory is empty - * assumes that check for is_dir has already been made - * only checks for full path matches, does not check relative paths, - * e.g. 
../dirname will not work - * returns 1 for yes it is empty - * returns 0 for no */ -int unifyfs_fid_is_dir_empty(const char* path) -{ - int i = 0; - while (i < unifyfs_max_files) { - /* only check this element if it's active */ - if (unifyfs_filelist[i].in_use) { - /* if the file starts with the path, it is inside of that directory - * also check that it's not the directory entry itself */ - char* strptr = strstr(path, unifyfs_filelist[i].filename); - if (strptr == unifyfs_filelist[i].filename && - strcmp(path, unifyfs_filelist[i].filename) != 0) { - /* found a child item in path */ - LOGDBG("File found: unifyfs_filelist[%d].filename = %s", - i, (char*)&unifyfs_filelist[i].filename); - return 0; - } - } - - /* go on to next file */ - i++; - } - - /* couldn't find any files with this prefix, dir must be empty */ - return 1; -} - -/* Return the global (laminated) size of the file */ -off_t unifyfs_fid_global_size(int fid) -{ - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta != NULL) { - return meta->attrs.size; - } - return (off_t)-1; -} - -/* - * Return the size of the file. If the file is laminated, return the - * laminated size. If the file is not laminated, return the local - * size. - */ -off_t unifyfs_fid_logical_size(int fid) -{ - /* get meta data for this file */ - if (unifyfs_fid_is_laminated(fid)) { - off_t size = unifyfs_fid_global_size(fid); - return size; - } else { - /* invoke an rpc to ask the server what the file size is */ - - /* sync any writes to disk before requesting file size */ - unifyfs_fid_sync(fid); - - /* get file size for this file */ - size_t filesize; - int gfid = unifyfs_gfid_from_fid(fid); - int ret = invoke_client_filesize_rpc(gfid, &filesize); - if (ret != UNIFYFS_SUCCESS) { - /* failed to get file size */ - return (off_t)-1; - } - return (off_t)filesize; - } -} - -/* if we have a local fid structure corresponding to the gfid - * in question, we attempt the file lookup with the fid method - * otherwise call back to the rpc */ -off_t unifyfs_gfid_filesize(int gfid) -{ - off_t filesize = (off_t)-1; - - /* see if we have a fid for this gfid */ - int fid = unifyfs_fid_from_gfid(gfid); - if (fid >= 0) { - /* got a fid, look up file size through that - * method, since it may avoid a server rpc call */ - filesize = unifyfs_fid_logical_size(fid); - } else { - /* no fid for this gfid, - * look it up with server rpc */ - size_t size; - int ret = invoke_client_filesize_rpc(gfid, &size); - if (ret == UNIFYFS_SUCCESS) { - /* got the file size successfully */ - filesize = size; - } - } - - return filesize; -} - -/* Update local metadata for file from global metadata */ -int unifyfs_fid_update_file_meta(int fid, unifyfs_file_attr_t* gfattr) -{ - if (NULL == gfattr) { - return EINVAL; - } - - /* lookup local metadata for file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - if (meta != NULL) { - meta->attrs = *gfattr; - return UNIFYFS_SUCCESS; - } - - /* else, bad fid */ - return EINVAL; -} - -/* - * Set the metadata values for a file (after optionally creating it). - * The gfid for the file is in f_meta->gfid. - * - * gfid: The global file id on which to set metadata. - * - * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. - * If the file already exists, then update its metadata with the values - * from fid filemeta. If not creating and the file does not exist, - * then the server will return an error. - * - * gfattr: The metadata values to store. 
- */ -int unifyfs_set_global_file_meta( - int gfid, - unifyfs_file_attr_op_e attr_op, - unifyfs_file_attr_t* gfattr) -{ - /* check that we have an input buffer */ - if (NULL == gfattr) { - return UNIFYFS_FAILURE; - } - - /* force the gfid field value to match the gfid we're - * submitting this under */ - gfattr->gfid = gfid; - - /* send file attributes to server */ - int ret = invoke_client_metaset_rpc(attr_op, gfattr); - return ret; -} - -int unifyfs_get_global_file_meta(int gfid, unifyfs_file_attr_t* gfattr) -{ - /* check that we have an output buffer to write to */ - if (NULL == gfattr) { - return UNIFYFS_FAILURE; - } - - /* attempt to lookup file attributes in key/value store */ - unifyfs_file_attr_t fmeta; - int ret = invoke_client_metaget_rpc(gfid, &fmeta); - if (ret == UNIFYFS_SUCCESS) { - /* found it, copy attributes to output struct */ - *gfattr = fmeta; - } - return ret; -} - -/* - * Set the global metadata values for a file using local file - * attributes associated with the given local file id. - * - * fid: The local file id on which to base global metadata values. - * - * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. - * If the file already exists, then update its metadata with the values - * from fid filemeta. If not creating and the file does not exist, - * then the server will return an error. - */ -int unifyfs_set_global_file_meta_from_fid(int fid, unifyfs_file_attr_op_e op) -{ - /* initialize an empty file attributes structure */ - unifyfs_file_attr_t fattr; - unifyfs_file_attr_set_invalid(&fattr); - - /* lookup local metadata for file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - - /* set global file id */ - fattr.gfid = meta->attrs.gfid; - - LOGDBG("setting global file metadata for fid:%d gfid:%d path:%s", - fid, fattr.gfid, meta->attrs.filename); - - unifyfs_file_attr_update(op, &fattr, &(meta->attrs)); - - LOGDBG("using following attributes"); - debug_print_file_attr(&fattr); - - /* submit file attributes to global key/value store */ - int ret = unifyfs_set_global_file_meta(fattr.gfid, op, &fattr); - return ret; -} - -/* allocate a file id slot for a new file - * return the fid or -1 on error */ -int unifyfs_fid_alloc(void) -{ - unifyfs_stack_lock(); - int fid = unifyfs_stack_pop(free_fid_stack); - unifyfs_stack_unlock(); - LOGDBG("unifyfs_stack_pop() gave %d", fid); - if (fid < 0) { - /* need to create a new file, but we can't */ - LOGERR("unifyfs_stack_pop() failed (%d)", fid); - return -EMFILE; - } - return fid; -} - -/* return the file id back to the free pool */ -int unifyfs_fid_free(int fid) -{ - unifyfs_stack_lock(); - unifyfs_stack_push(free_fid_stack, fid); - unifyfs_stack_unlock(); - return UNIFYFS_SUCCESS; -} - -/* add a new file and initialize metadata - * returns the new fid, or negative value on error */ -int unifyfs_fid_create_file(const char* path, - int exclusive) -{ - /* check that pathname is within bounds */ - size_t pathlen = strlen(path) + 1; - if (pathlen > UNIFYFS_MAX_FILENAME) { - return -ENAMETOOLONG; - } - - /* allocate an id for this file */ - int fid = unifyfs_fid_alloc(); - if (fid < 0) { - return fid; - } - - /* mark this slot as in use */ - unifyfs_filelist[fid].in_use = 1; - - /* copy file name into slot */ - strlcpy((void*)&unifyfs_filelist[fid].filename, path, UNIFYFS_MAX_FILENAME); - LOGDBG("Filename %s got unifyfs fid %d", - unifyfs_filelist[fid].filename, fid); - - /* get metadata for this file id */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - 
assert(meta != NULL); - - /* initialize file attributes */ - unifyfs_file_attr_set_invalid(&(meta->attrs)); - meta->attrs.gfid = unifyfs_generate_gfid(path); - meta->attrs.size = 0; - meta->attrs.mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; - meta->attrs.is_laminated = 0; - meta->attrs.is_shared = !exclusive; - meta->attrs.filename = (char*)&(unifyfs_filelist[fid].filename); - - /* use client user/group */ - meta->attrs.uid = getuid(); - meta->attrs.gid = getgid(); - - /* use current time for atime/mtime/ctime */ - struct timespec tp = {0}; - clock_gettime(CLOCK_REALTIME, &tp); - meta->attrs.atime = tp; - meta->attrs.mtime = tp; - meta->attrs.ctime = tp; - - /* set UnifyFS client metadata */ - meta->fid = fid; - meta->storage = FILE_STORAGE_NULL; - meta->needs_sync = 0; - - /* PTHREAD_PROCESS_SHARED allows Process-Shared Synchronization */ - meta->flock_status = UNLOCKED; - pthread_spin_init(&meta->fspinlock, PTHREAD_PROCESS_SHARED); - - return fid; -} - -/* create directory state for given path. returns success|error */ -int unifyfs_fid_create_directory(const char* path) -{ - /* check that pathname is within bounds */ - size_t pathlen = strlen(path) + 1; - if (pathlen > UNIFYFS_MAX_FILENAME) { - return ENAMETOOLONG; - } - - /* get local and global file ids */ - int fid = unifyfs_get_fid_from_path(path); - int gfid = unifyfs_generate_gfid(path); - - /* test whether we have info for file in our local file list */ - int found_local = (fid != -1); - - /* test whether we have metadata for file in global key/value store */ - int found_global = 0; - unifyfs_file_attr_t gfattr = { 0 }; - int rc = unifyfs_get_global_file_meta(gfid, &gfattr); - if (UNIFYFS_SUCCESS == rc) { - found_global = 1; - } - - if (found_local && !found_global) { - /* exists locally, but not globally - * - * FIXME: so, we have detected the cache inconsistency here. - * we cannot simply unlink or remove the entry because then we also - * need to check whether any subdirectories or files exist. - * - * this can happen when - * - a process created a directory. this process (A) has opened it at - * least once. - * - then, the directory has been deleted by another process (B). it - * deletes the global entry without checking any local used entries - * in other processes. - * - * we currently return EEXIST, and this needs to be addressed according - * to a consistency model this fs intance assumes. - */ - return EEXIST; - } - - /* now, we need to create a new directory. 
we reuse the file creation - * method and then update the mode to indicate it's a directory */ - if (!found_local) { - /* create a new file */ - fid = unifyfs_fid_create_file(path, 0); - if (fid < 0) { - /* convert negative error code to positive */ - return -fid; - } - - /* mark it as a directory */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; - - if (!found_global) { - /* insert global meta data for directory */ - unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; - rc = unifyfs_set_global_file_meta_from_fid(fid, op); - if (rc != UNIFYFS_SUCCESS) { - if (rc != EEXIST) { - LOGERR("Failed to add global metadata for dir %s (rc=%d)", - path, rc); - return rc; - } /* else, someone else created global metadata first */ - } - } - } - - return UNIFYFS_SUCCESS; -} - -/* delete a file id, free its local storage resources and return - * the file id to free stack */ -int unifyfs_fid_delete(int fid) -{ - /* finalize the storage we're using for this file */ - int rc = fid_storage_free(fid); - if (rc != UNIFYFS_SUCCESS) { - /* failed to release structures tracking storage, - * bail out to keep its file id active */ - return rc; - } - - /* set this file id as not in use */ - unifyfs_filelist[fid].in_use = 0; - - /* add this id back to the free stack */ - rc = unifyfs_fid_free(fid); - if (rc != UNIFYFS_SUCCESS) { - /* storage for the file was released, but we hit - * an error while freeing the file id */ - return rc; - } - - return UNIFYFS_SUCCESS; -} - -/* Write count bytes from buf into file starting at offset pos. - * - * Returns UNIFYFS_SUCCESS, or an error code - */ -int unifyfs_fid_write( - int fid, /* local file id to write to */ - off_t pos, /* starting position in file */ - const void* buf, /* buffer to be written */ - size_t count, /* number of bytes to write */ - size_t* nwritten) /* returns number of bytes written */ -{ - int rc; - - /* assume we won't write anything */ - *nwritten = 0; - - /* short-circuit a 0-byte write */ - if (count == 0) { - return UNIFYFS_SUCCESS; - } - - /* get meta for this file id */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - - if (meta->attrs.is_laminated) { - /* attempt to write to laminated file, return read-only filesystem */ - return EROFS; - } - - /* determine storage type to write file data */ - if (meta->storage == FILE_STORAGE_LOGIO) { - /* file stored in logged i/o */ - rc = unifyfs_fid_logio_write(fid, meta, pos, buf, count, nwritten); - if (rc == UNIFYFS_SUCCESS) { - /* write succeeded, remember that we have new data - * that needs to be synced with the server */ - meta->needs_sync = 1; - - /* optionally sync after every write */ - if (unifyfs_write_sync) { - int ret = unifyfs_sync_extents(fid); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("client sync after write failed"); - rc = ret; - } - } - } - } else { - /* unknown storage type */ - LOGERR("unknown storage type for fid=%d", fid); - rc = EIO; - } - - return rc; -} - -/* truncate file id to given length, frees resources if length is - * less than size and allocates and zero-fills new bytes if length - * is more than size */ -int unifyfs_fid_truncate(int fid, off_t length) -{ - /* get meta data for this file */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - - /* truncate is not valid for directories */ - if (S_ISDIR(meta->attrs.mode)) { - return EISDIR; - } - - if (meta->attrs.is_laminated) { - /* Can't truncate a 
laminated file */ - return EINVAL; - } - - if (meta->storage != FILE_STORAGE_LOGIO) { - /* unknown storage type */ - return EIO; - } - - /* remove/update writes past truncation size for this file id */ - int rc = truncate_write_meta(meta, length); - if (rc != UNIFYFS_SUCCESS) { - return rc; - } - - /* truncate is a sync point */ - rc = unifyfs_fid_sync(fid); - if (rc != UNIFYFS_SUCCESS) { - return rc; - } - - /* update global size in filemeta to reflect truncated size. - * note that log size is not affected */ - meta->attrs.size = length; - - /* invoke truncate rpc */ - int gfid = unifyfs_gfid_from_fid(fid); - rc = invoke_client_truncate_rpc(gfid, length); - if (rc != UNIFYFS_SUCCESS) { - return rc; - } - - return UNIFYFS_SUCCESS; +/* lock access to shared data structures in superblock */ +int unifyfs_stack_lock(unifyfs_client* client) +{ + return 0; } -/* sync data for file id to server if needed */ -int unifyfs_fid_sync(int fid) +/* unlock access to shared data structures in superblock */ +int unifyfs_stack_unlock(unifyfs_client* client) { - /* assume we'll succeed */ - int ret = UNIFYFS_SUCCESS; - - /* get metadata for the file id */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); - assert(meta != NULL); - - /* sync data with server */ - if (meta->needs_sync) { - ret = unifyfs_sync_extents(fid); - } - - return ret; + return 0; } -/* opens a new file id with specified path, access flags, and permissions, - * fills outfid with file id and outpos with position for current file pointer, - * returns UNIFYFS error code - */ -int unifyfs_fid_open( - const char* path, /* path of file to be opened */ - int flags, /* flags bits as from open(2) */ - mode_t mode, /* mode bits as from open(2) */ - int* outfid, /* allocated local file id if open is successful */ - off_t* outpos) /* initial file position if open is successful */ +/* get current file size. if we have a local file corresponding to the + * given gfid, we use the local metadata. otherwise, we use a global + * metadata lookup */ +off_t unifyfs_gfid_filesize(unifyfs_client* client, + int gfid) { - int ret; - - /* set the pointer to the start of the file */ - off_t pos = 0; - - /* check that pathname is within bounds */ - size_t pathlen = strlen(path) + 1; - if (pathlen > UNIFYFS_MAX_FILENAME) { - return ENAMETOOLONG; - } - - /* - * TODO: The test of file existence involves both local and global checks. - * However, the testing below does not seem to cover all cases. For - * instance, a globally unlinked file might be still cached locally because - * the broadcast for cache invalidation has not been implemented, yet. 
- */ - - /* look for local and global file ids */ - int fid = unifyfs_get_fid_from_path(path); - int gfid = unifyfs_generate_gfid(path); - LOGDBG("unifyfs_get_fid_from_path() gave %d (gfid = %d)", fid, gfid); - - /* test whether we have info for file in our local file list */ - int found_local = (fid >= 0); - - /* determine whether any write flags are specified */ - int open_for_write = flags & (O_RDWR | O_WRONLY); - - int exclusive = flags & O_EXCL; - - /* struct to hold global metadata for file */ - unifyfs_file_attr_t gfattr = { 0, }; - - /* if O_CREAT, - * if not local, allocate fid and storage - * create from local fid meta - * attempt to create global inode - * if EEXIST and O_EXCL, error and release fid/storage - * lookup global meta - * check that local and global info are consistent - * if O_TRUNC and not laminated, truncate - * else - * lookup global meta - * if not found, error - * check that local and global info are consistent - * if O_APPEND, set pos to file size - */ - - /* flag indicating whether file should be truncated */ - int need_truncate = 0; - - /* determine whether we are creating a new file - * or opening an existing one */ - if (flags & O_CREAT) { - /* user wants to create a new file, - * allocate a local file id structure if needed */ - if (!found_local) { - /* initialize local metadata for this file */ - fid = unifyfs_fid_create_file(path, exclusive); - if (fid < 0) { - LOGERR("failed to create a new file %s", path); - return -fid; - } - - /* initialize local storage for this file */ - ret = fid_store_alloc(fid); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("failed to allocate storage space for file %s (fid=%d)", - path, fid); - unifyfs_fid_delete(fid); - return ret; - } - - /* TODO: set meta->mode bits to mode variable */ - } - - /* insert file attribute for file in key-value store */ - unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; - ret = unifyfs_set_global_file_meta_from_fid(fid, op); - if (ret == EEXIST && !exclusive) { - /* File didn't exist before, but now it does. - * Another process beat us to the punch in creating it. - * Read its metadata to update our cache. */ - ret = unifyfs_get_global_file_meta(gfid, &gfattr); - if (ret == UNIFYFS_SUCCESS) { - if (found_local) { - /* TODO: check that global metadata is consistent with - * our existing local entry */ - } - - /* Successful in fetching metadata for existing file. - * Update our local cache using that metadata. */ - unifyfs_fid_update_file_meta(fid, &gfattr); - } else { - /* Failed to get metadata for a file that should exist. - * Perhaps it was since deleted. We could try to create - * it again and loop through these steps, but for now - * consider this situation to be an error. 
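The O_CREAT branch above publishes the new file's attributes with UNIFYFS_FILE_ATTR_OP_CREATE; if the server answers EEXIST and O_EXCL was not requested, the client fetches the existing global metadata and updates its local cache, while with O_EXCL the error is propagated (the error path just below). A hedged application-level sketch of the resulting create semantics (illustrative only, not part of this patch):

#include <errno.h>
#include <fcntl.h>

/* Two clients racing to create the same file under UnifyFS:
 * - with plain O_CREAT, the later creator adopts the metadata of the
 *   file that already exists and its open() still succeeds;
 * - with O_CREAT | O_EXCL, the later open() fails and errno is set
 *   from the EEXIST returned for the metadata create. */
int create_maybe_exclusive(const char* path, int exclusive)
{
    int flags = O_WRONLY | O_CREAT | (exclusive ? O_EXCL : 0);
    int fd = open(path, flags, 0644);
    if (fd < 0 && errno == EEXIST) {
        return -1;   /* another client created it first */
    }
    return fd;
}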
*/ - LOGERR("Failed to get metadata on existing file %s (fid:%d)", - path, fid); - } + off_t filesize = (off_t)-1; - /* check for truncate if the file exists already */ - if ((flags & O_TRUNC) && open_for_write && !gfattr.is_laminated) { - need_truncate = 1; - } - } - if (ret != UNIFYFS_SUCCESS) { - LOGERR("Failed to populate the global meta entry for %s (fid:%d)", - path, fid); - if (!found_local) { - /* free fid we just allocated above, - * but don't do that by calling fid_unlink */ - unifyfs_fid_delete(fid); - } - return ret; - } + /* see if we have a fid for this gfid */ + int fid = unifyfs_fid_from_gfid(client, gfid); + if (fid >= 0) { + /* got a fid, look up file size through that + * method, since it may avoid a server rpc call */ + filesize = unifyfs_fid_logical_size(client, fid); } else { - /* trying to open without creating, file must already exist, - * lookup global metadata for file */ - ret = unifyfs_get_global_file_meta(gfid, &gfattr); - if (ret != UNIFYFS_SUCCESS) { - /* bail out if we failed to find global file */ - if (found_local && ret == ENOENT) { - /* Have a local entry, but there is no global entry. - * Perhaps global file was unlinked? - * Invalidate our local entry. */ - LOGDBG("file found locally, but seems to be deleted globally. " - "invalidating the local cache."); - unifyfs_fid_delete(fid); - } - - return ret; - } - - /* succeeded in global lookup for file, - * allocate a local file id structure if needed */ - if (!found_local) { - /* initialize local metadata for this file */ - fid = unifyfs_fid_create_file(path, 0); - if (fid < 0) { - LOGERR("failed to create a new file %s", path); - return -fid; - } - - /* initialize local storage for this file */ - ret = fid_store_alloc(fid); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("failed to allocate storage space for file %s (fid=%d)", - path, fid); - /* free fid we just allocated above, - * but don't do that by calling fid_unlink */ - unifyfs_fid_delete(fid); - return ret; - } - } else { - /* TODO: already have a local entry for this path and found - * a global entry, check that they are consistent */ - } - - /* Successful in fetching metadata for existing file. - * Update our local cache using that metadata. */ - unifyfs_fid_update_file_meta(fid, &gfattr); - - /* check if we need to truncate the existing file */ - if ((flags & O_TRUNC) && open_for_write && !gfattr.is_laminated) { - need_truncate = 1; - } - } - - /* if given O_DIRECTORY, the named file must be a directory */ - if ((flags & O_DIRECTORY) && !unifyfs_fid_is_dir(fid)) { - if (!found_local) { - /* free fid we just allocated above, - * but don't do that by calling fid_unlink */ - unifyfs_fid_delete(fid); - } - return ENOTDIR; - } - - /* TODO: does O_DIRECTORY really have to be given to open a directory? */ - if (!(flags & O_DIRECTORY) && unifyfs_fid_is_dir(fid)) { - if (!found_local) { - /* free fid we just allocated above, - * but don't do that by calling fid_unlink */ - unifyfs_fid_delete(fid); - } - return EISDIR; - } - - /* - * Catch any case where we could potentially want to write to a laminated - * file. 
- */ - if (gfattr.is_laminated && - ((flags & (O_CREAT | O_TRUNC | O_APPEND | O_WRONLY)) || - ((mode & 0222) && (flags != O_RDONLY)))) { - LOGDBG("Can't open laminated file %s with a writable flag.", path); - /* TODO: free fid we just allocated above, - * but don't do that by calling fid_unlink */ - if (!found_local) { - /* free fid we just allocated above, - * but don't do that by calling fid_unlink */ - unifyfs_fid_delete(fid); - } - return EROFS; - } - - /* truncate the file, if we have to */ - if (need_truncate) { - ret = unifyfs_fid_truncate(fid, 0); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("Failed to truncate the file %s", path); - return ret; + /* no fid for this gfid, + * look it up with server rpc */ + size_t size; + int ret = invoke_client_filesize_rpc(client, gfid, &size); + if (ret == UNIFYFS_SUCCESS) { + /* got the file size successfully */ + filesize = size; } } - /* do we normally update position to EOF with O_APPEND? */ - if ((flags & O_APPEND) && open_for_write) { - /* We only support O_APPEND on non-laminated files */ - pos = unifyfs_fid_logical_size(fid); - } - - /* return local file id and starting file position */ - *outfid = fid; - *outpos = pos; - - return UNIFYFS_SUCCESS; + return filesize; } -int unifyfs_fid_close(int fid) +/* + * Set the metadata values for a file (after optionally creating it). + * The gfid for the file is in f_meta->gfid. + * + * gfid: The global file id on which to set metadata. + * + * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. + * If the file already exists, then update its metadata with the values + * from fid filemeta. If not creating and the file does not exist, + * then the server will return an error. + * + * gfattr: The metadata values to store. + */ +int unifyfs_set_global_file_meta(unifyfs_client* client, + int gfid, + unifyfs_file_attr_op_e attr_op, + unifyfs_file_attr_t* gfattr) { - /* TODO: clear any held locks */ + /* check that we have an input buffer */ + if (NULL == gfattr) { + return UNIFYFS_FAILURE; + } - /* nothing to do here, just a place holder */ - return UNIFYFS_SUCCESS; + /* force the gfid field value to match the gfid we're + * submitting this under */ + gfattr->gfid = gfid; + + /* send file attributes to server */ + int ret = invoke_client_metaset_rpc(client, attr_op, gfattr); + return ret; } -/* unlink file and then delete its associated state */ -int unifyfs_fid_unlink(int fid) +int unifyfs_get_global_file_meta(unifyfs_client* client, + int gfid, + unifyfs_file_attr_t* gfattr) { - int rc; - - /* invoke unlink rpc */ - int gfid = unifyfs_gfid_from_fid(fid); - rc = invoke_client_unlink_rpc(gfid); - if (rc != UNIFYFS_SUCCESS) { - /* TODO: if item does not exist globally, but just locally, - * we still want to delete item locally */ - return rc; + /* check that we have an output buffer to write to */ + if (NULL == gfattr) { + return UNIFYFS_FAILURE; } - /* finalize the storage we're using for this file */ - rc = unifyfs_fid_delete(fid); - if (rc != UNIFYFS_SUCCESS) { - /* released storage for file, but failed to release - * structures tracking storage, again bail out to keep - * its file id active */ - return rc; + /* attempt to lookup file attributes in key/value store */ + unifyfs_file_attr_t fmeta; + int ret = invoke_client_metaget_rpc(client, gfid, &fmeta); + if (ret == UNIFYFS_SUCCESS) { + /* found it, copy attributes to output struct */ + *gfattr = fmeta; } - - return UNIFYFS_SUCCESS; + return ret; } -/* ======================================= - * Operations to mount/unmount file system - 
* ======================================= */ - /* ------------- * static APIs * ------------- */ @@ -1448,7 +157,7 @@ int unifyfs_fid_unlink(int fid) * persist file system meta data. It also contains a fixed-size * region for keeping log index entries for each file. * - * - stack of free local file ids of length max_files, + * - stack of free local file ids of length client->max_files, * the local file id is used to index into other data * structures * @@ -1461,14 +170,14 @@ int unifyfs_fid_unlink(int fid) * * - count of number of active index entries * - array of index metadata to track physical offset - * of logical file data, of length unifyfs_max_index_entries, + * of logical file data, of length max_write_index_entries, * entries added during write operations */ /* compute memory size of superblock in bytes, * critical to keep this consistent with * init_superblock_pointers */ -static size_t get_superblock_size(void) +static size_t get_superblock_size(unifyfs_client* client) { size_t sb_size = 0; @@ -1477,17 +186,17 @@ static size_t get_superblock_size(void) sb_size += sizeof(uint32_t); /* free file id stack */ - sb_size += unifyfs_stack_bytes(unifyfs_max_files); + sb_size += unifyfs_stack_bytes(client->max_files); /* file name struct array */ - sb_size += unifyfs_max_files * sizeof(unifyfs_filename_t); + sb_size += client->max_files * sizeof(unifyfs_filename_t); /* file metadata struct array */ - sb_size += unifyfs_max_files * sizeof(unifyfs_filemeta_t); + sb_size += client->max_files * sizeof(unifyfs_filemeta_t); /* index region size */ - sb_size += unifyfs_page_size; - sb_size += unifyfs_max_index_entries * sizeof(unifyfs_index_t); + sb_size += get_page_size(); + sb_size += client->max_write_index_entries * sizeof(unifyfs_index_t); /* return number of bytes */ return sb_size; @@ -1496,67 +205,74 @@ static size_t get_superblock_size(void) static inline char* next_page_align(char* ptr) { + size_t pgsz = get_page_size(); intptr_t orig = (intptr_t) ptr; intptr_t aligned = orig; - intptr_t offset = orig % unifyfs_page_size; + intptr_t offset = orig % pgsz; if (offset) { - aligned += (unifyfs_page_size - offset); + aligned += (pgsz - offset); } LOGDBG("orig=0x%p, next-page-aligned=0x%p", ptr, (char*)aligned); return (char*) aligned; } /* initialize our global pointers into the given superblock */ -static void init_superblock_pointers(void* superblock) +static void init_superblock_pointers(unifyfs_client* client, + void* superblock) { - char* ptr = (char*)superblock; + char* super = (char*) superblock; + char* ptr = super; /* jump over header (right now just a uint32_t to record * magic value of 0xdeadbeef if initialized */ ptr += sizeof(uint32_t); /* stack to manage free file ids */ - free_fid_stack = ptr; - ptr += unifyfs_stack_bytes(unifyfs_max_files); + client->free_fid_stack = ptr; + ptr += unifyfs_stack_bytes(client->max_files); /* record list of file names */ - unifyfs_filelist = (unifyfs_filename_t*)ptr; - ptr += unifyfs_max_files * sizeof(unifyfs_filename_t); + client->unifyfs_filelist = (unifyfs_filename_t*)ptr; + ptr += client->max_files * sizeof(unifyfs_filename_t); /* array of file meta data structures */ - unifyfs_filemetas = (unifyfs_filemeta_t*)ptr; - ptr += unifyfs_max_files * sizeof(unifyfs_filemeta_t); - - /* record pointer to number of index entries */ - unifyfs_indices.ptr_num_entries = (size_t*)ptr; - - /* pointer to array of index entries */ - ptr += unifyfs_page_size; - unifyfs_indices.index_entry = (unifyfs_index_t*)ptr; - ptr += unifyfs_max_index_entries * 
sizeof(unifyfs_index_t); + client->unifyfs_filemetas = (unifyfs_filemeta_t*)ptr; + ptr += client->max_files * sizeof(unifyfs_filemeta_t); + + /* record pointers to number of index entries and entries array */ + size_t pgsz = get_page_size(); + size_t entries_size = + client->max_write_index_entries * sizeof(unifyfs_index_t); + size_t index_size = pgsz + entries_size; + + client->state.write_index.index_offset = (size_t)(ptr - super); + client->state.write_index.index_size = index_size; + client->state.write_index.ptr_num_entries = (size_t*)ptr; + ptr += pgsz; + client->state.write_index.index_entries = (unifyfs_index_t*)ptr; + ptr += client->max_write_index_entries * sizeof(unifyfs_index_t); /* compute size of memory we're using and check that * it matches what we allocated */ size_t ptr_size = (size_t)(ptr - (char*)superblock); - if (ptr_size > shm_super_ctx->size) { + if (ptr_size > client->state.shm_super_ctx->size) { LOGERR("Data structures in superblock extend beyond its size"); } } /* initialize data structures for first use */ -static int init_superblock_structures(void) +static int init_superblock_structures(unifyfs_client* client) { - int i; - for (i = 0; i < unifyfs_max_files; i++) { + for (int i = 0; i < client->max_files; i++) { /* indicate that file id is not in use by setting flag to 0 */ - unifyfs_filelist[i].in_use = 0; + client->unifyfs_filelist[i].in_use = 0; } /* initialize stack of free file ids */ - unifyfs_stack_init(free_fid_stack, unifyfs_max_files); + unifyfs_stack_init(client->free_fid_stack, client->max_files); /* initialize count of key/value entries */ - *(unifyfs_indices.ptr_num_entries) = 0; + *(client->state.write_index.ptr_num_entries) = 0; LOGDBG("Meta-stacks initialized!"); @@ -1565,22 +281,24 @@ static int init_superblock_structures(void) /* create superblock of specified size and name, or attach to existing * block if available */ -static int init_superblock_shm(size_t super_sz) +static int init_superblock_shm(unifyfs_client* client, + size_t super_sz) { char shm_name[SHMEM_NAME_LEN] = {0}; /* attach shmem region for client's superblock */ - sprintf(shm_name, SHMEM_SUPER_FMTSTR, unifyfs_app_id, unifyfs_client_id); + sprintf(shm_name, SHMEM_SUPER_FMTSTR, + client->state.app_id, client->state.client_id); shm_context* shm_ctx = unifyfs_shm_alloc(shm_name, super_sz); if (NULL == shm_ctx) { LOGERR("Failed to attach to shmem superblock region %s", shm_name); return UNIFYFS_ERROR_SHMEM; } - shm_super_ctx = shm_ctx; + client->state.shm_super_ctx = shm_ctx; /* init our global variables to point to spots in superblock */ void* addr = shm_ctx->addr; - init_superblock_pointers(addr); + init_superblock_pointers(client, addr); /* initialize structures in superblock if it's newly allocated, * we depend on shm_open setting all bytes to 0 to know that @@ -1588,7 +306,7 @@ static int init_superblock_shm(size_t super_sz) uint32_t initialized = *(uint32_t*)addr; if (initialized == 0) { /* not yet initialized, so initialize values within superblock */ - init_superblock_structures(); + init_superblock_structures(client); /* superblock structure has been initialized, * so set flag to indicate that fact */ @@ -1605,19 +323,20 @@ static int init_superblock_shm(size_t super_sz) /* Clear any index entries from the cache. We do this to ensure * the newly allocated seg trees are consistent with the extents - * in the index. It would be nice to call unifyfs_sync_extents to flush + * in the index. 
It would be nice to call unifyfs_sync_extents to flush * any entries to the server, but we can't do that since that will * try to rewrite the index using the trees, which point to invalid * memory at this point. */ + /* initialize count of key/value entries */ - *(unifyfs_indices.ptr_num_entries) = 0; + *(client->state.write_index.ptr_num_entries) = 0; - int i; - for (i = 0; i < unifyfs_max_files; i++) { + unifyfs_filemeta_t* meta; + for (int i = 0; i < client->max_files; i++) { /* if the file entry is active, reset its segment trees */ - if (unifyfs_filelist[i].in_use) { + if (client->unifyfs_filelist[i].in_use) { /* got a live file, get pointer to its metadata */ - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(i); + meta = unifyfs_get_meta_from_fid(client, i); assert(meta != NULL); /* Reset our segment tree that will record our writes */ @@ -1625,7 +344,7 @@ static int init_superblock_shm(size_t super_sz) /* Reset our segment tree to track extents for all writes * by this process, can be used to read back local data */ - if (unifyfs_local_extents) { + if (client->use_local_extents) { seg_tree_init(&meta->extents); } } @@ -1636,515 +355,190 @@ static int init_superblock_shm(size_t super_sz) return UNIFYFS_SUCCESS; } -int unifyfs_init(unifyfs_cfg_t* clnt_cfg) +int unifyfs_client_init(unifyfs_client* client) { int rc; - int i; - bool b; - long l; - unsigned long long bits; - char* cfgval; - - if (!unifyfs_initialized) { - -#ifdef UNIFYFS_GOTCHA - rc = setup_gotcha_wrappers(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to setup gotcha wrappers"); - return rc; - } -#endif - - /* as a hack to support fgetpos/fsetpos, we store the value of - * a void* in an fpos_t so check that there's room and at least - * print a message if this won't work */ - if (sizeof(fpos_t) < sizeof(void*)) { - LOGERR("fgetpos/fsetpos will not work correctly"); - unifyfs_fpos_enabled = 0; - } - - /* look up page size for buffer alignment */ - unifyfs_page_size = get_page_size(); - /* compute min and max off_t values */ - bits = sizeof(off_t) * 8; - unifyfs_max_offt = (off_t)((1ULL << (bits - 1ULL)) - 1ULL); - unifyfs_min_offt = (off_t)(-(1ULL << (bits - 1ULL))); - - /* compute min and max long values */ - unifyfs_max_long = LONG_MAX; - unifyfs_min_long = LONG_MIN; - - /* set our current working directory if user gave us one */ - cfgval = clnt_cfg->client_cwd; - if (cfgval != NULL) { - unifyfs_cwd = strdup(cfgval); - - /* check that cwd falls somewhere under the mount point */ - int cwd_within_mount = 0; - if (strncmp(unifyfs_cwd, unifyfs_mount_prefix, - unifyfs_mount_prefixlen) == 0) { - /* characters in target up through mount point match, - * assume we match */ - cwd_within_mount = 1; - - /* if we have another character, it must be '/' */ - if (strlen(unifyfs_cwd) > unifyfs_mount_prefixlen && - unifyfs_cwd[unifyfs_mount_prefixlen] != '/') { - cwd_within_mount = 0; - } - } - if (!cwd_within_mount) { - /* path given in CWD is outside of the UnifyFS mount point */ - LOGERR("UNIFYFS_CLIENT_CWD '%s' must be within the mount '%s'", - unifyfs_cwd, unifyfs_mount_prefix); - - /* ignore setting and set back to NULL */ - free(unifyfs_cwd); - unifyfs_cwd = NULL; - } - } else { - /* user did not specify a CWD, so initialize with the actual - * current working dir */ - char* cwd = getcwd(NULL, 0); - if (cwd != NULL) { - unifyfs_cwd = cwd; - } else { - LOGERR("Failed getcwd (%s)", strerror(errno)); - } - } + if (NULL == client) { + return EINVAL; + } - /* determine max number of files to store in file system */ - 
unifyfs_max_files = UNIFYFS_CLIENT_MAX_FILES; - cfgval = clnt_cfg->client_max_files; - if (cfgval != NULL) { - rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - unifyfs_max_files = (int)l; - } + /* add client to client_list */ + if (NULL == client_list) { + client_list = arraylist_create(0); + if (NULL == client_list) { + LOGERR("failed to create client_list arraylist"); + return UNIFYFS_FAILURE; } + } + rc = arraylist_add(client_list, client); + if (rc == -1) { + LOGERR("failed to add client to client_list arraylist"); + return UNIFYFS_FAILURE; + } - /* Determine if we should track all write extents and use them - * to service read requests if all data is local */ - unifyfs_local_extents = 0; - cfgval = clnt_cfg->client_local_extents; - if (cfgval != NULL) { - rc = configurator_bool_val(cfgval, &b); - if (rc == 0) { - unifyfs_local_extents = (bool)b; - } - } + if (!client->state.initialized) { - /* Determine whether we automatically sync every write to server. - * This slows write performance, but it can serve as a work - * around for apps that do not have all necessary syncs. */ - unifyfs_write_sync = false; - cfgval = clnt_cfg->client_write_sync; - if (cfgval != NULL) { - rc = configurator_bool_val(cfgval, &b); - if (rc == 0) { - unifyfs_write_sync = (bool)b; - } - } + // print log messages to stderr + unifyfs_log_open(NULL); - /* Determine SUPER MAGIC value to return from statfs. - * Use UNIFYFS_SUPER_MAGIC if true, TMPFS_SUPER_MAGIC otherwise. */ - unifyfs_super_magic = true; - cfgval = client_cfg.client_super_magic; - if (cfgval != NULL) { - rc = configurator_bool_val(cfgval, &b); - if (rc == 0) { - unifyfs_super_magic = (bool)b; - } + // initialize configuration + unifyfs_cfg_t* client_cfg = &(client->cfg); + rc = unifyfs_config_init(client_cfg, 0, NULL, 0, NULL); + if (rc) { + LOGERR("failed to initialize configuration."); + return UNIFYFS_FAILURE; } + client_cfg->ptype = UNIFYFS_CLIENT; - /* define size of buffer used to cache key/value pairs for - * data offsets before passing them to the server */ - unifyfs_index_buf_size = UNIFYFS_CLIENT_WRITE_INDEX_SIZE; - cfgval = clnt_cfg->client_write_index_size; + // set log level from config + char* cfgval = client_cfg->log_verbosity; if (cfgval != NULL) { + long l; rc = configurator_int_val(cfgval, &l); if (rc == 0) { - unifyfs_index_buf_size = (size_t)l; + unifyfs_set_log_level((unifyfs_log_level_t)l); } } - unifyfs_max_index_entries = - unifyfs_index_buf_size / sizeof(unifyfs_index_t); - - /* record the max fd for the system */ - /* RLIMIT_NOFILE specifies a value one greater than the maximum - * file descriptor number that can be opened by this process */ - struct rlimit r_limit; - - if (getrlimit(RLIMIT_NOFILE, &r_limit) < 0) { - LOGERR("getrlimit failed: errno=%d (%s)", errno, strerror(errno)); - return UNIFYFS_FAILURE; - } - unifyfs_fd_limit = r_limit.rlim_cur; - LOGDBG("FD limit for system = %ld", unifyfs_fd_limit); - - /* initialize file descriptor structures */ - int num_fds = UNIFYFS_CLIENT_MAX_FILEDESCS; - for (i = 0; i < num_fds; i++) { - unifyfs_fd_init(i); - } - - /* initialize file stream structures */ - int num_streams = UNIFYFS_CLIENT_MAX_FILEDESCS; - for (i = 0; i < num_streams; i++) { - unifyfs_stream_init(i); - } - - /* initialize directory stream structures */ - int num_dirstreams = UNIFYFS_CLIENT_MAX_FILEDESCS; - for (i = 0; i < num_dirstreams; i++) { - unifyfs_dirstream_init(i); - } - - /* initialize stack of free fd values */ - size_t free_fd_size = unifyfs_stack_bytes(num_fds); - unifyfs_fd_stack = 
malloc(free_fd_size);
-        unifyfs_stack_init(unifyfs_fd_stack, num_fds);
-
-        /* initialize stack of free stream values */
-        size_t free_stream_size = unifyfs_stack_bytes(num_streams);
-        unifyfs_stream_stack = malloc(free_stream_size);
-        unifyfs_stack_init(unifyfs_stream_stack, num_streams);
-
-        /* initialize stack of free directory stream values */
-        size_t free_dirstream_size = unifyfs_stack_bytes(num_dirstreams);
-        unifyfs_dirstream_stack = malloc(free_dirstream_size);
-        unifyfs_stack_init(unifyfs_dirstream_stack, num_dirstreams);
 
         /* determine the size of the superblock */
-        size_t shm_super_size = get_superblock_size();
+        size_t shm_super_size = get_superblock_size(client);
 
         /* get a superblock of shared memory and initialize our
          * global variables for this block */
-        rc = init_superblock_shm(shm_super_size);
+        int rc = init_superblock_shm(client, shm_super_size);
         if (rc != UNIFYFS_SUCCESS) {
             LOGERR("failed to initialize superblock shmem");
             return rc;
         }
 
-        /* initialize active_mreads arraylist */
-        active_mreads = arraylist_create(UNIFYFS_CLIENT_MAX_READ_COUNT);
-        if (NULL == active_mreads) {
-            LOGERR("failed to create arraylist for active reads");
-            return UNIFYFS_FAILURE;
-        }
-
         /* initialize log-based I/O context */
-        rc = unifyfs_logio_init_client(unifyfs_app_id, unifyfs_client_id,
-                                       clnt_cfg, &logio_ctx);
+        rc = unifyfs_logio_init_client(client->state.app_id,
+                                       client->state.client_id,
+                                       &(client->cfg),
+                                       &(client->state.logio_ctx));
         if (rc != UNIFYFS_SUCCESS) {
             LOGERR("failed to initialize log-based I/O (rc = %s)",
                    unifyfs_rc_enum_str(rc));
             return rc;
         }
 
+        /* initialize client->active_mreads arraylist */
+        client->active_mreads =
+            arraylist_create(UNIFYFS_CLIENT_MAX_READ_COUNT);
+        if (NULL == client->active_mreads) {
+            LOGERR("failed to create arraylist for active reads");
+            return UNIFYFS_FAILURE;
+        }
+
+        /* initialize client->active_transfers arraylist */
+        client->active_transfers =
+            arraylist_create(UNIFYFS_CLIENT_MAX_FILES);
+        if (NULL == client->active_transfers) {
+            LOGERR("failed to create arraylist for active transfers");
+            return UNIFYFS_FAILURE;
+        }
+
         /* remember that we've now initialized the library */
-        unifyfs_initialized = 1;
+        client->state.initialized = 1;
     }
 
     return UNIFYFS_SUCCESS;
 }
 
-/* free resources allocated during unifyfs_init().
+/* free resources allocated during unifyfs_client_init().
  * generally, we do this in reverse order with respect to
  * how things were initialized */
-int unifyfs_fini(void)
+int unifyfs_client_fini(unifyfs_client* client)
 {
     int rc = UNIFYFS_SUCCESS;
 
-    if (!unifyfs_initialized) {
-        /* not initialized yet, so we shouldn't call finalize */
-        return UNIFYFS_FAILURE;
+    if (NULL == client) {
+        return EINVAL;
     }
 
-    /* close spillover files */
-    if (NULL != logio_ctx) {
-        unifyfs_logio_close(logio_ctx, 0);
-        logio_ctx = NULL;
-    }
-    if (unifyfs_spillmetablock != -1) {
-        close(unifyfs_spillmetablock);
-        unifyfs_spillmetablock = -1;
+    int list_pos = -1;
+    unifyfs_client* list_clnt = unifyfs_find_client(client->state.app_id,
+                                                    client->state.client_id,
+                                                    &list_pos);
+    if (list_clnt != client) {
+        LOGWARN("mismatch on client_list client");
+    } else {
+        /* remove from client_list */
+        arraylist_remove(client_list, list_pos);
     }
 
-    /* detach from superblock shmem, but don't unlink the file so that
-     * a later client can reattach.
*/ - unifyfs_shm_free(&shm_super_ctx); + if (!client->state.initialized) { + /* not initialized yet, so we shouldn't call finalize */ + return UNIFYFS_FAILURE; + } - /* free directory stream stack */ - if (unifyfs_dirstream_stack != NULL) { - free(unifyfs_dirstream_stack); - unifyfs_dirstream_stack = NULL; + if (NULL != client->active_mreads) { + arraylist_free(client->active_mreads); } - /* free file stream stack */ - if (unifyfs_stream_stack != NULL) { - free(unifyfs_stream_stack); - unifyfs_stream_stack = NULL; + if (NULL != client->active_transfers) { + arraylist_free(client->active_transfers); } - /* free file descriptor stack */ - if (unifyfs_fd_stack != NULL) { - free(unifyfs_fd_stack); - unifyfs_fd_stack = NULL; + /* close spillover files */ + if (NULL != client->state.logio_ctx) { + unifyfs_logio_close(client->state.logio_ctx, 0); + client->state.logio_ctx = NULL; } + /* detach from superblock shmem, but don't unlink the file so that + * a later client can reattach. */ + unifyfs_shm_free(&(client->state.shm_super_ctx)); + /* no longer initialized, so update the flag */ - unifyfs_initialized = 0; + client->state.initialized = 0; return rc; } - -/* --------------- - * external APIs - * --------------- */ - -/* Fill mount rpc input struct with client-side context info */ -void fill_client_mount_info(unifyfs_cfg_t* clnt_cfg, - unifyfs_mount_in_t* in) -{ - in->dbg_rank = client_rank; - in->mount_prefix = strdup(clnt_cfg->unifyfs_mountpoint); -} - -/* Fill attach rpc input struct with client-side context info */ -void fill_client_attach_info(unifyfs_cfg_t* clnt_cfg, - unifyfs_attach_in_t* in) -{ - size_t meta_offset = (char*)unifyfs_indices.ptr_num_entries - - (char*)shm_super_ctx->addr; - size_t meta_size = unifyfs_max_index_entries - * sizeof(unifyfs_index_t); - - in->app_id = unifyfs_app_id; - in->client_id = unifyfs_client_id; - in->shmem_super_size = shm_super_ctx->size; - in->meta_offset = meta_offset; - in->meta_size = meta_size; - - if (NULL != logio_ctx->shmem) { - in->logio_mem_size = logio_ctx->shmem->size; - } else { - in->logio_mem_size = 0; - } - - in->logio_spill_size = logio_ctx->spill_sz; - if (logio_ctx->spill_sz) { - in->logio_spill_dir = strdup(clnt_cfg->logio_spill_dir); - } else { - in->logio_spill_dir = NULL; - } -} - -/** - * mount a file system at a given prefix - * subtype: 0-> log-based file system; - * 1->striping based file system, not implemented yet. 
- * @param prefix: directory prefix - * @param size: the number of ranks - * @param l_app_id: application ID - * @return success/error code - */ -int unifyfs_mount( - const char prefix[], - int rank, - size_t size, - int l_app_id) +/* find client in client_list with given app_id and client_id */ +unifyfs_client* unifyfs_find_client(int app_id, + int client_id, + int* list_position) { - int rc; - int kv_rank, kv_nranks; - - if (-1 != unifyfs_mounted) { - if (l_app_id != unifyfs_mounted) { - LOGERR("multiple mount support not yet implemented"); - return UNIFYFS_FAILURE; - } else { - LOGDBG("already mounted"); - return UNIFYFS_SUCCESS; - } - } - - // record our rank for debugging messages - client_rank = rank; - global_rank_cnt = (int)size; - - // print log messages to stderr - unifyfs_log_open(NULL); - - // initialize configuration - rc = unifyfs_config_init(&client_cfg, 0, NULL, 0, NULL); - if (rc) { - LOGERR("failed to initialize configuration."); - return UNIFYFS_FAILURE; - } - client_cfg.ptype = UNIFYFS_CLIENT; - - // set log level from config - char* cfgval = client_cfg.log_verbosity; - if (cfgval != NULL) { - long l; - rc = configurator_int_val(cfgval, &l); - if (rc == 0) { - unifyfs_set_log_level((unifyfs_log_level_t)l); - } - } - - // record mountpoint prefix string - unifyfs_mount_prefix = strdup(prefix); - unifyfs_mount_prefixlen = strlen(unifyfs_mount_prefix); - client_cfg.unifyfs_mountpoint = unifyfs_mount_prefix; - - // generate app_id from mountpoint prefix - unifyfs_app_id = unifyfs_generate_gfid(unifyfs_mount_prefix); - if (l_app_id != 0) { - LOGDBG("ignoring passed app_id=%d, using mountpoint app_id=%d", - l_app_id, unifyfs_app_id); - } - - // initialize k-v store access - kv_rank = client_rank; - kv_nranks = size; - rc = unifyfs_keyval_init(&client_cfg, &kv_rank, &kv_nranks); - if (rc) { - LOGERR("failed to initialize kvstore"); - return UNIFYFS_FAILURE; - } - if ((client_rank != kv_rank) || (size != kv_nranks)) { - LOGDBG("mismatch on mount vs kvstore rank/size"); - } - - /* open rpc connection to server */ - rc = unifyfs_client_rpc_init(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to initialize client RPC"); - return rc; - } - - /* Call client mount rpc function to get client id */ - LOGDBG("calling mount rpc"); - rc = invoke_client_mount_rpc(&client_cfg); - if (rc != UNIFYFS_SUCCESS) { - /* If we fail to connect to the server, bail with an error */ - LOGERR("failed to mount to server"); - return rc; - } - - /* initialize our library using assigned client id, creates shared memory - * regions (e.g., superblock and data recv) and inits log-based I/O */ - rc = unifyfs_init(&client_cfg); - if (rc != UNIFYFS_SUCCESS) { - return rc; - } - - /* Call client attach rpc function to register our newly created shared - * memory and files with server */ - LOGDBG("calling attach rpc"); - rc = invoke_client_attach_rpc(&client_cfg); - if (rc != UNIFYFS_SUCCESS) { - /* If we fail, bail with an error */ - LOGERR("failed to attach to server"); - unifyfs_fini(); - return rc; + if (NULL == client_list) { + return NULL; } - /* add mount point as a new directory in the file list */ - int fid = unifyfs_get_fid_from_path(prefix); - if (fid < 0) { - /* no entry exists for mount point, so create one */ - rc = unifyfs_fid_create_directory(prefix); - if ((rc != UNIFYFS_SUCCESS) && (rc != EEXIST)) { - /* if there was an error other than EEXIST, return it */ - LOGERR("failed to create directory for mount point: %s", prefix); - unifyfs_fini(); - return rc; + int n_clients = 
arraylist_size(client_list); + for (int i = 0; i < n_clients; i++) { + void* item = arraylist_get(client_list, i); + if (NULL != item) { + unifyfs_client* clnt = (unifyfs_client*) item; + if ((clnt->state.app_id == app_id) && + (clnt->state.client_id == client_id)) { + if (NULL != list_position) { + *list_position = i; + } + return clnt; + } } } - - /* record client state as mounted for specific app_id */ - unifyfs_mounted = unifyfs_app_id; - - return UNIFYFS_SUCCESS; + return NULL; } -/** - * unmount the mounted file system - * TODO: Add support for unmounting more than - * one filesystem. - * @return success/error code - */ -int unifyfs_unmount(void) +/* Sync all the write extents for any client files to the server. + * Returns UNIFYFS_SUCCESS on success, failure code otherwise */ +int unifyfs_sync_files(unifyfs_client* client) { int ret = UNIFYFS_SUCCESS; - if (-1 == unifyfs_mounted) { - return UNIFYFS_SUCCESS; - } - - /* sync any outstanding writes */ - LOGDBG("syncing data"); - int rc = unifyfs_sync_extents(-1); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("client sync failed"); - ret = UNIFYFS_FAILURE; - } - - /************************ - * tear down connection to server - ************************/ - - /* invoke unmount rpc to tell server we're disconnecting */ - LOGDBG("calling unmount"); - rc = invoke_client_unmount_rpc(); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("client unmount rpc failed"); - ret = UNIFYFS_FAILURE; - } - - /* free resources allocated in client_rpc_init */ - unifyfs_client_rpc_finalize(); - - /************************ - * free our mount point, and detach from structures - * storing data - ************************/ - - /* free resources allocated in unifyfs_init */ - unifyfs_fini(); - - /* free memory tracking our mount prefix string */ - if (unifyfs_mount_prefix != NULL) { - free(unifyfs_mount_prefix); - unifyfs_mount_prefix = NULL; - unifyfs_mount_prefixlen = 0; - client_cfg.unifyfs_mountpoint = NULL; - } - - /************************ - * free configuration values - ************************/ - - /* free global holding current working directory */ - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); - } - - /* clean up configuration */ - rc = unifyfs_config_fini(&client_cfg); - if (rc != 0) { - LOGERR("unifyfs_config_fini() failed"); - ret = UNIFYFS_FAILURE; + /* sync every active file */ + for (int i = 0; i < client->max_files; i++) { + if (client->unifyfs_filelist[i].in_use) { + /* got an active file, so sync this file id */ + int rc = unifyfs_fid_sync_extents(client, i); + if (UNIFYFS_SUCCESS != rc) { + ret = rc; + } + } } - /* shut down our logging */ - unifyfs_log_close(); - - unifyfs_mounted = -1; - return ret; } diff --git a/client/src/unifyfs.h b/client/src/unifyfs.h index 6852c230a..fbb9e2fab 100644 --- a/client/src/unifyfs.h +++ b/client/src/unifyfs.h @@ -27,32 +27,56 @@ extern "C" { /* "UnifyFS!" 
in ASCII */ #define UNIFYFS_SUPER_MAGIC (0x556E696679465321) -int unifyfs_mount(const char prefix[], int rank, size_t size, - int l_app_id); +/** + * Mount UnifyFS file system at given prefix + * + * @param prefix mountpoint prefix + * @param rank client rank within application + * @param size the number of application clients + * + * @return success/error code + */ +int unifyfs_mount(const char prefix[], int rank, size_t size); + +/** + * Unmount the UnifyFS file system + * + * @return success/error code + */ int unifyfs_unmount(void); + +/* Enumeration to control transfer mode */ +enum { + UNIFYFS_TRANSFER_SERIAL = 0, + UNIFYFS_TRANSFER_PARALLEL = 1, +}; + /** - * @brief transfer a single file between unifyfs and other file system. either - * @src or @dst should (not both) specify a unifyfs pathname, i.e., /unifyfs/.. + * Transfer a single file between UnifyFS and another file system. + * Either @src or @dst (not both) should specify a path within + * the UnifyFS namespace prefixed by the mountpoint, e.g., /unifyfs/.. * - * @param src source file path - * @param dst destination file path - * @param parallel parallel transfer if set (parallel=1) + * @param src source file path + * @param dst destination file path + * @param mode transfer mode * * @return 0 on success, negative errno otherwise. */ -int unifyfs_transfer_file(const char* src, const char* dst, int parallel); +int unifyfs_transfer_file(const char* src, + const char* dst, + int mode); static inline int unifyfs_transfer_file_serial(const char* src, const char* dst) { - return unifyfs_transfer_file(src, dst, 0); + return unifyfs_transfer_file(src, dst, UNIFYFS_TRANSFER_SERIAL); } static inline int unifyfs_transfer_file_parallel(const char* src, const char* dst) { - return unifyfs_transfer_file(src, dst, 1); + return unifyfs_transfer_file(src, dst, UNIFYFS_TRANSFER_PARALLEL); } diff --git a/client/src/unifyfs_api.c b/client/src/unifyfs_api.c index f6f61c160..97d87e171 100644 --- a/client/src/unifyfs_api.c +++ b/client/src/unifyfs_api.c @@ -13,13 +13,14 @@ */ #include "unifyfs_api_internal.h" +#include "unifyfs_fid.h" +#include "margo_client.h" /* * Public Methods */ /* Initialize client's use of UnifyFS */ -// TODO: replace unifyfs_mount() unifyfs_rc unifyfs_initialize(const char* mountpoint, unifyfs_cfg_option* options, int n_opts, unifyfs_handle* fshdl) @@ -38,10 +39,10 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, LOGERR("failed to allocate client handle"); return ENOMEM; } - unifyfs_app_id = unifyfs_generate_gfid(mountpoint); - client->app_id = unifyfs_app_id; // initialize configuration + long l; + bool b; unifyfs_cfg_t* client_cfg = &(client->cfg); int rc = unifyfs_config_init(client_cfg, 0, NULL, n_opts, options); if (rc) { @@ -50,19 +51,114 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, } client_cfg->ptype = UNIFYFS_CLIENT; client_cfg->unifyfs_mountpoint = strdup(mountpoint); - unifyfs_mount_prefix = client_cfg->unifyfs_mountpoint; - unifyfs_mount_prefixlen = strlen(unifyfs_mount_prefix); + client->state.mount_prefix = strdup(mountpoint); + client->state.mount_prefixlen = strlen(mountpoint); - // set log level from config - char* cfgval = client_cfg->log_verbosity; + /* set our current working directory if user provided one */ + char* cfgval = client_cfg->client_cwd; + if (cfgval != NULL) { + client->cwd = strdup(cfgval); + + /* check that cwd falls somewhere under the mount point */ + int cwd_within_mount = 0; + if (0 == strncmp(client->cwd, client->state.mount_prefix, + 
client->state.mount_prefixlen)) { + /* characters in target up through mount point match, + * assume we match */ + cwd_within_mount = 1; + + /* if we have another character, it must be '/' */ + if ((strlen(client->cwd) > client->state.mount_prefixlen) && + (client->cwd[client->state.mount_prefixlen] != '/')) { + cwd_within_mount = 0; + } + } + if (!cwd_within_mount) { + /* path given in CWD is outside of the UnifyFS mount point */ + LOGERR("UNIFYFS_CLIENT_CWD '%s' must be within the mount '%s'", + client->cwd, client->state.mount_prefix); + + /* ignore setting and set back to NULL */ + free(client->cwd); + client->cwd = NULL; + } + } else { + /* user did not specify a CWD, so initialize with the actual + * current working dir */ + char* cwd = getcwd(NULL, 0); + if (cwd != NULL) { + client->cwd = cwd; + } else { + LOGERR("Failed getcwd (%s)", strerror(errno)); + } + } + + /* set log level from config */ + cfgval = client_cfg->log_verbosity; if (cfgval != NULL) { - long l; rc = configurator_int_val(cfgval, &l); if (rc == 0) { unifyfs_set_log_level((unifyfs_log_level_t)l); } } + /* determine max number of files to store in file system */ + client->max_files = UNIFYFS_CLIENT_MAX_FILES; + cfgval = client_cfg->client_max_files; + if (cfgval != NULL) { + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + client->max_files = (int)l; + } + } + + /* Determine if we should track all write extents and use them + * to service read requests if all data is local */ + client->use_local_extents = 0; + cfgval = client_cfg->client_local_extents; + if (cfgval != NULL) { + rc = configurator_bool_val(cfgval, &b); + if (rc == 0) { + client->use_local_extents = (bool)b; + } + } + + /* Determine whether we automatically sync every write to server. + * This slows write performance, but it can serve as a work + * around for apps that do not have all necessary syncs. */ + client->use_write_sync = false; + cfgval = client_cfg->client_write_sync; + if (cfgval != NULL) { + rc = configurator_bool_val(cfgval, &b); + if (rc == 0) { + client->use_write_sync = (bool)b; + } + } + + /* Determine SUPER MAGIC value to return from statfs. + * Use UNIFYFS_SUPER_MAGIC if true, TMPFS_SUPER_MAGIC otherwise. 
*/ + client->use_unifyfs_magic = true; + cfgval = client_cfg->client_super_magic; + if (cfgval != NULL) { + rc = configurator_bool_val(cfgval, &b); + if (rc == 0) { + client->use_unifyfs_magic = (bool)b; + } + } + + /* Define size of buffer used to cache key/value pairs for + * data offsets before passing them to the server */ + client->write_index_size = UNIFYFS_CLIENT_WRITE_INDEX_SIZE; + cfgval = client_cfg->client_write_index_size; + if (cfgval != NULL) { + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + client->write_index_size = (size_t)l; + } + } + client->max_write_index_entries = + client->write_index_size / sizeof(unifyfs_index_t); + // initialize k-v store access int kv_rank = 0; int kv_nranks = 1; @@ -80,20 +176,20 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, } /* Call client mount rpc function to get client id */ + int app_id = unifyfs_generate_gfid(mountpoint); + client->state.app_id = app_id; LOGDBG("calling mount rpc"); - rc = invoke_client_mount_rpc(client_cfg); + rc = invoke_client_mount_rpc(client); if (rc != UNIFYFS_SUCCESS) { /* If we fail to connect to the server, bail with an error */ LOGERR("failed to mount to server"); return rc; } - unifyfs_mounted = unifyfs_app_id; - client->is_mounted = true; - client->client_id = unifyfs_client_id; + client->state.is_mounted = true; /* initialize our library using assigned client id, creates shared memory * regions (e.g., superblock and data recv) and inits log-based I/O */ - rc = unifyfs_init(client_cfg); + rc = unifyfs_client_init(client); if (rc != UNIFYFS_SUCCESS) { return rc; } @@ -101,23 +197,23 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, /* Call client attach rpc function to register our newly created shared * memory and files with server */ LOGDBG("calling attach rpc"); - rc = invoke_client_attach_rpc(client_cfg); + rc = invoke_client_attach_rpc(client); if (rc != UNIFYFS_SUCCESS) { /* If we fail, bail with an error */ LOGERR("failed to attach to server"); - unifyfs_fini(); + unifyfs_client_fini(client); return rc; } /* add mount point as a new directory in the file list */ - if (unifyfs_get_fid_from_path(mountpoint) < 0) { + if (unifyfs_fid_from_path(client, mountpoint) < 0) { /* no entry exists for mount point, so create one */ - int fid = unifyfs_fid_create_directory(mountpoint); + int fid = unifyfs_fid_create_directory(client, mountpoint); if (fid < 0) { /* if there was an error, return it */ LOGERR("failed to create directory entry for mount point: `%s'", mountpoint); - unifyfs_fini(); + unifyfs_client_fini(client); return UNIFYFS_FAILURE; } } @@ -128,7 +224,6 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, } /* Finalize client's use of UnifyFS */ -// TODO: replace unifyfs_unmount() unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl) { if (UNIFYFS_INVALID_HANDLE == fshdl) { @@ -138,10 +233,10 @@ unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl) int ret = UNIFYFS_SUCCESS; - if (client->is_mounted) { + if (client->state.is_mounted) { /* sync any outstanding writes */ LOGDBG("syncing data"); - int rc = unifyfs_sync_extents(-1); + int rc = unifyfs_sync_files(client); if (rc != UNIFYFS_SUCCESS) { LOGERR("client sync failed"); ret = rc; @@ -149,42 +244,30 @@ unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl) /* invoke unmount rpc to tell server we're disconnecting */ LOGDBG("calling unmount"); - rc = invoke_client_unmount_rpc(); + rc = invoke_client_unmount_rpc(client); if (rc != UNIFYFS_SUCCESS) { LOGERR("client unmount rpc failed"); ret = rc; } - - unifyfs_mounted = -1; } /* free 
resources allocated in client_rpc_init */ unifyfs_client_rpc_finalize(); - /************************ - * free our mount point, and detach from structures - * storing data - ************************/ - - /* free resources allocated in unifyfs_init */ - unifyfs_fini(); + /* free resources allocated in unifyfs_client_init */ + unifyfs_client_fini(client); /* free memory tracking our mount prefix string */ - if (unifyfs_mount_prefix != NULL) { - free(unifyfs_mount_prefix); - unifyfs_mount_prefix = NULL; - unifyfs_mount_prefixlen = 0; - client->cfg.unifyfs_mountpoint = NULL; + if (client->state.mount_prefix != NULL) { + free(client->state.mount_prefix); + client->state.mount_prefix = NULL; + client->state.mount_prefixlen = 0; } - /************************ - * free configuration values - ************************/ - /* free global holding current working directory */ - if (unifyfs_cwd != NULL) { - free(unifyfs_cwd); - unifyfs_cwd = NULL; + if (client->cwd != NULL) { + free(client->cwd); + client->cwd = NULL; } /* clean up configuration */ diff --git a/client/src/unifyfs_api.h b/client/src/unifyfs_api.h index 34d230e9a..b86cf78ab 100644 --- a/client/src/unifyfs_api.h +++ b/client/src/unifyfs_api.h @@ -184,12 +184,14 @@ unifyfs_rc unifyfs_create(unifyfs_handle fshdl, * Open an existing file in UnifyFS. * * @param[in] fshdl Client file system handle + * @param[in] flags File access flags * @param[in] filepath Path of file to open * @param[out] gfid Global file id of opened file * * @return UnifyFS success or failure code */ unifyfs_rc unifyfs_open(unifyfs_handle fshdl, + const int flags, const char* filepath, unifyfs_gfid* gfid); diff --git a/client/src/unifyfs_api_file.c b/client/src/unifyfs_api_file.c index cc61b766d..ada064b05 100644 --- a/client/src/unifyfs_api_file.c +++ b/client/src/unifyfs_api_file.c @@ -13,8 +13,38 @@ */ #include "unifyfs_api_internal.h" +#include "unifyfs_fid.h" +#include "margo_client.h" +/* + * Internal Methods + */ + +bool is_unifyfs_path(unifyfs_client* client, + const char* filepath) +{ + /* the library API expects absolute paths without relative components, + * so we don't do any path normalization here */ + + /* if the path starts with our mount point, intercept it */ + bool intercept = false; + char* mount = client->state.mount_prefix; + size_t len = client->state.mount_prefixlen; + if (strncmp(filepath, mount, len) == 0) { + /* characters in target up through mount point match, + * so assume we match */ + intercept = true; + + /* if we have another character, it must be '/' */ + if ((strlen(filepath) > len) && + (filepath[len] != '/')) { + intercept = 0; + } + } + return intercept; +} + /* * Public Methods */ @@ -32,6 +62,13 @@ unifyfs_rc unifyfs_create(unifyfs_handle fshdl, } *gfid = UNIFYFS_INVALID_GFID; + unifyfs_client* client = fshdl; + + /* make sure requested file is within client namespace */ + if (!is_unifyfs_path(client, filepath)) { + return (unifyfs_rc)EINVAL; + } + /* NOTE: the 'flags' parameter is not currently used. 
it is reserved * for future indication of file-specific behavior */ @@ -41,8 +78,9 @@ unifyfs_rc unifyfs_create(unifyfs_handle fshdl, off_t filepos = -1; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; - int create_flags = (O_CREAT | O_EXCL); - int rc = unifyfs_fid_open(filepath, create_flags, mode, &fid, &filepos); + int create_flags = O_CREAT; + int rc = unifyfs_fid_open(client, filepath, create_flags, mode, + &fid, &filepos); if (UNIFYFS_SUCCESS == rc) { *gfid = unifyfs_generate_gfid(filepath); } @@ -51,6 +89,7 @@ unifyfs_rc unifyfs_create(unifyfs_handle fshdl, /* Open an existing file in UnifyFS */ unifyfs_rc unifyfs_open(unifyfs_handle fshdl, + const int flags, const char* filepath, unifyfs_gfid* gfid) { @@ -61,14 +100,20 @@ unifyfs_rc unifyfs_open(unifyfs_handle fshdl, } *gfid = UNIFYFS_INVALID_GFID; + unifyfs_client* client = fshdl; + + /* make sure requested file is within client namespace */ + if (!is_unifyfs_path(client, filepath)) { + return (unifyfs_rc)EINVAL; + } + /* the output parameters of unifyfs_fid_open() are not used here, but * must be provided */ int fid = -1; off_t filepos = -1; mode_t mode = 0; - int flags = O_RDWR; - int rc = unifyfs_fid_open(filepath, flags, mode, &fid, &filepos); + int rc = unifyfs_fid_open(client, filepath, flags, mode, &fid, &filepos); if (UNIFYFS_SUCCESS == rc) { *gfid = unifyfs_generate_gfid(filepath); } @@ -84,12 +129,14 @@ unifyfs_rc unifyfs_sync(unifyfs_handle fshdl, return (unifyfs_rc)EINVAL; } - int fid = unifyfs_fid_from_gfid((int)gfid); + unifyfs_client* client = fshdl; + + int fid = unifyfs_fid_from_gfid(client, (int)gfid); if (-1 == fid) { return (unifyfs_rc)EINVAL; } - int rc = unifyfs_fid_sync(fid); + int rc = unifyfs_fid_sync_extents(client, fid); return (unifyfs_rc)rc; } @@ -104,12 +151,14 @@ unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, return (unifyfs_rc)EINVAL; } - int fid = unifyfs_fid_from_gfid((int)gfid); + unifyfs_client* client = fshdl; + + int fid = unifyfs_fid_from_gfid(client, (int)gfid); if (-1 == fid) { return (unifyfs_rc)EINVAL; } - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(fid); + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); if (meta == NULL) { LOGERR("missing local file metadata for gfid=%d", (int)gfid); return UNIFYFS_FAILURE; @@ -117,12 +166,12 @@ unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, /* get global metadata to pick up current file size */ unifyfs_file_attr_t attr = {0}; - int rc = unifyfs_get_global_file_meta((int)gfid, &attr); + int rc = unifyfs_get_global_file_meta(client, (int)gfid, &attr); if (UNIFYFS_SUCCESS != rc) { LOGERR("missing global file metadata for gfid=%d", (int)gfid); } else { /* update local file metadata from global metadata */ - unifyfs_fid_update_file_meta(fid, &attr); + unifyfs_fid_update_file_meta(client, fid, &attr); } st->global_file_size = meta->attrs.size; @@ -144,21 +193,28 @@ unifyfs_rc unifyfs_laminate(unifyfs_handle fshdl, return (unifyfs_rc)EINVAL; } + unifyfs_client* client = fshdl; + + /* make sure requested file is within client namespace */ + if (!is_unifyfs_path(client, filepath)) { + return (unifyfs_rc)EINVAL; + } + int gfid = unifyfs_generate_gfid(filepath); - int rc = invoke_client_laminate_rpc(gfid); + int rc = invoke_client_laminate_rpc(client, gfid); if (UNIFYFS_SUCCESS == rc) { /* update the local state for this file (if any) */ - int fid = unifyfs_fid_from_gfid((int)gfid); + int fid = unifyfs_fid_from_gfid(client, (int)gfid); if (-1 != fid) { /* get global metadata to pick up file size and laminated flag */ 
unifyfs_file_attr_t attr = {0}; - rc = unifyfs_get_global_file_meta(gfid, &attr); + rc = unifyfs_get_global_file_meta(client, gfid, &attr); if (UNIFYFS_SUCCESS != rc) { LOGERR("missing global metadata for %s (gfid:%d)", filepath, gfid); } else { /* update local file metadata from global metadata */ - unifyfs_fid_update_file_meta(fid, &attr); + unifyfs_fid_update_file_meta(client, fid, &attr); } } } @@ -175,19 +231,26 @@ unifyfs_rc unifyfs_remove(unifyfs_handle fshdl, return (unifyfs_rc)EINVAL; } + unifyfs_client* client = fshdl; + + /* make sure requested file is within client namespace */ + if (!is_unifyfs_path(client, filepath)) { + return (unifyfs_rc)EINVAL; + } + unifyfs_rc ret = UNIFYFS_SUCCESS; /* invoke unlink rpc */ int gfid = unifyfs_generate_gfid(filepath); - int rc = invoke_client_unlink_rpc(gfid); + int rc = invoke_client_unlink_rpc(client, gfid); if (rc != UNIFYFS_SUCCESS) { ret = rc; } /* clean up the local state for this file (if any) */ - int fid = unifyfs_fid_from_gfid(gfid); + int fid = unifyfs_fid_from_gfid(client, gfid); if (-1 != fid) { - rc = unifyfs_fid_delete(fid); + rc = unifyfs_fid_delete(client, fid); if (rc != UNIFYFS_SUCCESS) { /* released storage for file, but failed to release * structures tracking storage, again bail out to keep diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h index b2040591e..6eecff3b4 100644 --- a/client/src/unifyfs_api_internal.h +++ b/client/src/unifyfs_api_internal.h @@ -15,23 +15,114 @@ #ifndef UNIFYFS_API_INTERNAL_H #define UNIFYFS_API_INTERNAL_H +// client headers #include "unifyfs_api.h" #include "unifyfs-internal.h" -#include "unifyfs-fixed.h" -// client-server rpc headers -#include "unifyfs_client_rpcs.h" -#include "unifyfs_rpc_util.h" -#include "margo_client.h" +// common headers +#include "unifyfs_client.h" + +/* --- types and structures --- */ + +enum unifyfs_file_storage { + FILE_STORAGE_NULL = 0, + FILE_STORAGE_LOGIO +}; + +/* Client file metadata */ +typedef struct { + int fid; /* local file index in filemetas array */ + int storage; /* FILE_STORAGE type */ + + int needs_sync; /* have unsynced writes */ + struct seg_tree extents_sync; /* Segment tree containing our coalesced + * writes between sync operations */ + struct seg_tree extents; /* Segment tree of all local data extents */ + + unifyfs_file_attr_t attrs; /* UnifyFS and POSIX file attributes */ +} unifyfs_filemeta_t; + +/* struct used to map a full path to its local file id, + * an array of these is kept and a simple linear search + * is used to find a match */ +typedef struct { + /* flag incidating whether slot is in use */ + int in_use; + + /* full path and name of file */ + const char filename[UNIFYFS_MAX_FILENAME]; +} unifyfs_filename_t; /* UnifyFS file system client structure */ typedef struct unifyfs_client { - int app_id; /* application id (gfid for mountpoint) */ - int client_id; /* client id within application */ + unifyfs_client_state state; + + /* mountpoint configuration */ + unifyfs_cfg_t cfg; /* user-provided configuration */ + + bool use_local_extents; /* enable tracking of local extents */ + bool use_write_sync; /* sync for every write operation */ + bool use_unifyfs_magic; /* return UNIFYFS (true) or TMPFS (false) + * magic value from statfs() */ + + int max_files; /* max number of files to store */ + + size_t write_index_size; /* size of metadata log */ + size_t max_write_index_entries; /* max metadata log entries */ - bool is_mounted; /* has client mounted? 
*/ + /* tracks current working directory within namespace */ + char* cwd; + + /* an arraylist to maintain the active mread requests for the client */ + arraylist_t* active_mreads; + unsigned int mread_id_generator; /* to generate unique mread ids */ + + /* an arraylist to maintain the active transfer requests for the client */ + arraylist_t* active_transfers; + unsigned int transfer_id_generator; /* to generate unique transfer ids */ + + /* per-file metadata */ + void* free_fid_stack; + unifyfs_filename_t* unifyfs_filelist; + unifyfs_filemeta_t* unifyfs_filemetas; - unifyfs_cfg_t cfg; /* client configuration */ } unifyfs_client; +/* Client initialization and finalization methods */ +int unifyfs_client_init(unifyfs_client* client); +int unifyfs_client_fini(unifyfs_client* client); + +/* find client with given app_id and client_id */ +unifyfs_client* unifyfs_find_client(int app_id, + int client_id, + int* list_position); + +/* lock/unlock access to shared data structures in client superblock */ +int unifyfs_stack_lock(unifyfs_client* client); +int unifyfs_stack_unlock(unifyfs_client* client); + +/* set global file metadata */ +int unifyfs_set_global_file_meta(unifyfs_client* client, + int gfid, + unifyfs_file_attr_op_e op, + unifyfs_file_attr_t* gfattr); + +/* get global file metadata */ +int unifyfs_get_global_file_meta(unifyfs_client* client, + int gfid, + unifyfs_file_attr_t* gfattr); + +/* sync all writes for client files with the server */ +int unifyfs_sync_files(unifyfs_client* client); + +/* get current file size. if we have a local file corresponding to the + * given gfid, we use the local metadata. otherwise, we use a global + * metadata lookup */ +off_t unifyfs_gfid_filesize(unifyfs_client* client, + int gfid); + +/* check if client mountpoint is prefix of given filepath */ +bool is_unifyfs_path(unifyfs_client* client, + const char* filepath); + #endif // UNIFYFS_API_INTERNAL_H diff --git a/client/src/unifyfs_api_io.c b/client/src/unifyfs_api_io.c index e0df90d00..45ba123f7 100644 --- a/client/src/unifyfs_api_io.c +++ b/client/src/unifyfs_api_io.c @@ -22,7 +22,8 @@ * Private Methods */ -static int process_gfid_writes(unifyfs_io_request* wr_reqs, +static int process_gfid_writes(unifyfs_client* client, + unifyfs_io_request* wr_reqs, size_t n_reqs) { int ret = UNIFYFS_SUCCESS; @@ -31,7 +32,7 @@ static int process_gfid_writes(unifyfs_io_request* wr_reqs, for (i = 0; i < n_reqs; i++) { unifyfs_io_request* req = wr_reqs + i; - int fid = unifyfs_fid_from_gfid(req->gfid); + int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { req->state = UNIFYFS_IOREQ_STATE_COMPLETED; req->result.error = EINVAL; @@ -48,7 +49,7 @@ static int process_gfid_writes(unifyfs_io_request* wr_reqs, } /* write user buffer to file */ - int rc = unifyfs_fid_write(fid, req->offset, req->user_buf, + int rc = unifyfs_fid_write(client, fid, req->offset, req->user_buf, req->nbytes, &(req->result.count)); if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; @@ -65,7 +66,8 @@ static int process_gfid_writes(unifyfs_io_request* wr_reqs, return ret; } -static int process_gfid_truncates(unifyfs_io_request* tr_reqs, +static int process_gfid_truncates(unifyfs_client* client, + unifyfs_io_request* tr_reqs, size_t n_reqs) { int ret = UNIFYFS_SUCCESS; @@ -74,13 +76,13 @@ static int process_gfid_truncates(unifyfs_io_request* tr_reqs, for (i = 0; i < n_reqs; i++) { unifyfs_io_request* req = tr_reqs + i; - int fid = unifyfs_fid_from_gfid(req->gfid); + int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { 
req->state = UNIFYFS_IOREQ_STATE_COMPLETED; req->result.error = EINVAL; } - int rc = unifyfs_fid_truncate(fid, req->offset); + int rc = unifyfs_fid_truncate(client, fid, req->offset); if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; } @@ -90,7 +92,8 @@ static int process_gfid_truncates(unifyfs_io_request* tr_reqs, return ret; } -static int process_gfid_syncs(unifyfs_io_request* s_reqs, +static int process_gfid_syncs(unifyfs_client* client, + unifyfs_io_request* s_reqs, size_t n_reqs) { int ret = UNIFYFS_SUCCESS; @@ -101,13 +104,13 @@ static int process_gfid_syncs(unifyfs_io_request* s_reqs, unifyfs_io_request* req = s_reqs + i; if (req->op == UNIFYFS_IOREQ_OP_SYNC_META) { - int fid = unifyfs_fid_from_gfid(req->gfid); + int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { req->state = UNIFYFS_IOREQ_STATE_COMPLETED; req->result.error = EINVAL; } - rc = unifyfs_fid_sync(fid); + rc = unifyfs_fid_sync_extents(client, fid); if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; } @@ -115,7 +118,7 @@ static int process_gfid_syncs(unifyfs_io_request* s_reqs, } else if (req->op == UNIFYFS_IOREQ_OP_SYNC_DATA) { /* logio_sync covers all files' data - only do it once */ if (!data_sync_completed) { - rc = unifyfs_logio_sync(logio_ctx); + rc = unifyfs_logio_sync(client->state.logio_ctx); if (UNIFYFS_SUCCESS != rc) { req->result.error = rc; } else { @@ -149,6 +152,8 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, return EINVAL; } + unifyfs_client* client = fshdl; + unifyfs_io_request* req; /* determine counts of various operations */ @@ -260,7 +265,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, } /* process reads */ - int rc = process_gfid_reads(rd_reqs, (int)n_read); + int rc = process_gfid_reads(client, rd_reqs, (int)n_read); if (rc != UNIFYFS_SUCCESS) { /* error encountered while issuing reads */ for (i = 0; i < n_read; i++) { @@ -270,7 +275,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, } /* process writes */ - rc = process_gfid_writes(wr_reqs, n_write); + rc = process_gfid_writes(client, wr_reqs, n_write); if (rc != UNIFYFS_SUCCESS) { /* error encountered while issuing writes */ for (i = 0; i < n_write; i++) { @@ -280,7 +285,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, } /* process truncates */ - rc = process_gfid_truncates(tr_reqs, n_trunc); + rc = process_gfid_truncates(client, tr_reqs, n_trunc); if (rc != UNIFYFS_SUCCESS) { /* error encountered while issuing writes */ for (i = 0; i < n_trunc; i++) { @@ -290,7 +295,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, } /* process syncs */ - rc = process_gfid_syncs(s_reqs, n_sync); + rc = process_gfid_syncs(client, s_reqs, n_sync); if (rc != UNIFYFS_SUCCESS) { /* error encountered while issuing writes */ for (i = 0; i < n_sync; i++) { diff --git a/client/src/unifyfs_api_transfer.c b/client/src/unifyfs_api_transfer.c index 558c3de33..15c4b82ff 100644 --- a/client/src/unifyfs_api_transfer.c +++ b/client/src/unifyfs_api_transfer.c @@ -16,11 +16,6 @@ #include "unifyfs_api_internal.h" #include "client_transfer.h" -/* this avoids a #include of */ -extern int unifyfs_transfer_file(const char* src, - const char* dst, - int parallel); - /* * Public Methods */ @@ -37,51 +32,13 @@ unifyfs_rc unifyfs_dispatch_transfer(unifyfs_handle fshdl, if (nreqs == 0) { return UNIFYFS_SUCCESS; } else if (NULL == reqs) { + /* non-zero req count, but NULL reqs pointer */ return EINVAL; } - unifyfs_transfer_request* req; - for (size_t i = 0; i < nreqs; i++) { - req = reqs + i; - req->state = 
UNIFYFS_IOREQ_STATE_IN_PROGRESS; - - /* check for a valid transfer mode */ - switch (req->mode) { - case UNIFYFS_TRANSFER_MODE_COPY: - case UNIFYFS_TRANSFER_MODE_MOVE: - break; - default: - req->result.error = EINVAL; - req->result.rc = UNIFYFS_FAILURE; - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; - continue; - } - - int rc = unifyfs_transfer_file(req->src_path, req->dst_path, - req->use_parallel); - if (rc) { - /* unifyfs_transfer_file() returns a negative error code */ - req->result.error = -rc; - req->result.rc = UNIFYFS_FAILURE; - } else { - req->result.error = 0; - req->result.rc = UNIFYFS_SUCCESS; - - if (req->mode == UNIFYFS_TRANSFER_MODE_MOVE) { - /* successful copy, now remove source */ - errno = 0; - rc = unlink(req->src_path); - if (rc) { - req->result.error = errno; - req->result.rc = UNIFYFS_FAILURE; - } - } - } - - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; - } - - return UNIFYFS_SUCCESS; + unifyfs_client* client = fshdl; + size_t n_reqs = nreqs; + return client_submit_transfers(client, reqs, n_reqs); } /* Cancel an array of transfer requests */ @@ -99,6 +56,16 @@ unifyfs_rc unifyfs_cancel_transfer(unifyfs_handle fshdl, return EINVAL; } + for (size_t i = 0; i < nreqs; i++) { + unifyfs_transfer_request* req = reqs + i; + if (req->state != UNIFYFS_IOREQ_STATE_COMPLETED) { + req->state = UNIFYFS_IOREQ_STATE_CANCELED; + + /* TODO: cancel the transfer */ + } + } + + /* not actually canceling the transfer yet */ return UNIFYFS_ERROR_NYI; } @@ -118,14 +85,28 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, return EINVAL; } + unifyfs_client* client = fshdl; + unifyfs_transfer_request* req; + client_transfer_status* transfer; size_t i, n_done; - while (1) { + int max_loop = 30000; + int loop_cnt = 0; + do { n_done = 0; for (i = 0; i < nreqs; i++) { - unifyfs_transfer_request* req = reqs + i; - if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || - (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + req = reqs + i; + transfer = client_get_transfer(client, req->_reqid); + if ((NULL != transfer) && + client_check_transfer_complete(transfer)) { + LOGDBG("checked - complete"); + n_done++; + client_cleanup_transfer(client, transfer); + } else if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || + (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + /* this handles the case where we have already cleaned the + * transfer status in a prior loop iteration */ n_done++; + LOGDBG("state - complete"); } } if (waitall) { @@ -137,7 +118,17 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, /* at least one req is done */ break; } + + /* TODO: we probably need a timeout mechanism to prevent an infinite + * loop when something goes wrong and the transfer status never + * gets updated. For now, just using a hardcoded maximum loop + * iteration count that roughly equates to 30 sec */ + loop_cnt++; usleep(1000); /* sleep 1 ms */ + } while (loop_cnt < max_loop); + + if (loop_cnt == max_loop) { + return ETIMEDOUT; } return UNIFYFS_SUCCESS; diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c new file mode 100644 index 000000000..d1710bdfb --- /dev/null +++ b/client/src/unifyfs_fid.c @@ -0,0 +1,1145 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. 
+ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include "unifyfs_fid.h" +#include "margo_client.h" + +/* --------------------------------------- + * fid stack management + * --------------------------------------- */ + +/* allocate a file id slot for a new file + * return the fid or -1 on error */ +int unifyfs_fid_alloc(unifyfs_client* client) +{ + unifyfs_stack_lock(client); + int fid = unifyfs_stack_pop(client->free_fid_stack); + unifyfs_stack_unlock(client); + LOGDBG("unifyfs_stack_pop() gave %d", fid); + if (fid < 0) { + /* need to create a new file, but we can't */ + LOGERR("unifyfs_stack_pop() failed (%d)", fid); + return -EMFILE; + } + return fid; +} + +/* return the file id back to the free pool */ +int unifyfs_fid_free(unifyfs_client* client, + int fid) +{ + unifyfs_stack_lock(client); + unifyfs_stack_push(client->free_fid_stack, fid); + unifyfs_stack_unlock(client); + return UNIFYFS_SUCCESS; +} + +/* --------------------------------------- + * fid metadata update operations + * --------------------------------------- */ + +/* allocate and initialize data management resource for file */ +static int fid_store_alloc(unifyfs_client* client, + int fid) +{ + /* get meta data for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((meta != NULL) && (meta->fid == fid)) { + /* Initialize our segment tree that will record our writes */ + int rc = seg_tree_init(&meta->extents_sync); + if (rc != 0) { + return UNIFYFS_FAILURE; + } + + /* Initialize our segment tree to track extents for all writes + * by this process, can be used to read back local data */ + if (client->use_local_extents) { + rc = seg_tree_init(&meta->extents); + if (rc != 0) { + /* clean up extents_sync tree we initialized */ + seg_tree_destroy(&meta->extents_sync); + return UNIFYFS_FAILURE; + } + } + + /* indicate that we're using LOGIO to store data for this file */ + meta->storage = FILE_STORAGE_LOGIO; + + return UNIFYFS_SUCCESS; + } else { + LOGERR("failed to get filemeta for fid=%d", fid); + } + + return UNIFYFS_FAILURE; +} + +/* free data management resource for file */ +static int fid_storage_free(unifyfs_client* client, + int fid) +{ + /* get meta data for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((meta != NULL) && (meta->fid == fid)) { + if (meta->storage == FILE_STORAGE_LOGIO) { + /* Free our write seg_tree */ + seg_tree_destroy(&meta->extents_sync); + + /* Free our extent seg_tree */ + if (client->use_local_extents) { + seg_tree_destroy(&meta->extents); + } + } + + /* set storage type back to NULL */ + meta->storage = FILE_STORAGE_NULL; + + return UNIFYFS_SUCCESS; + } + + return UNIFYFS_FAILURE; +} + + +/* Update local metadata for file from global metadata */ +int unifyfs_fid_update_file_meta(unifyfs_client* client, + int fid, + unifyfs_file_attr_t* gfattr) +{ + if (NULL == gfattr) { + return EINVAL; + } + + /* lookup local metadata for file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if (meta != NULL) { + meta->attrs = *gfattr; + return UNIFYFS_SUCCESS; + } + + /* else, bad fid */ + return EINVAL; +} + +/* + * Set the global metadata values for a file using local file + * attributes associated with the given local file id. + * + * fid: The local file id on which to base global metadata values. + * + * op: If set to FILE_ATTR_OP_CREATE, attempt to create the file first. + * If the file already exists, then update its metadata with the values + * from fid filemeta. 
If not creating and the file does not exist, + * then the server will return an error. + */ +int unifyfs_set_global_file_meta_from_fid(unifyfs_client* client, + int fid, + unifyfs_file_attr_op_e op) +{ + /* initialize an empty file attributes structure */ + unifyfs_file_attr_t fattr; + unifyfs_file_attr_set_invalid(&fattr); + + /* lookup local metadata for file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + assert(meta != NULL); + + /* set global file id */ + fattr.gfid = meta->attrs.gfid; + + LOGDBG("setting global file metadata for fid:%d gfid:%d path:%s", + fid, fattr.gfid, meta->attrs.filename); + + unifyfs_file_attr_update(op, &fattr, &(meta->attrs)); + + LOGDBG("using following attributes"); + debug_print_file_attr(&fattr); + + /* submit file attributes to global key/value store */ + int ret = unifyfs_set_global_file_meta(client, fattr.gfid, op, &fattr); + return ret; +} + +/* add a new file and initialize metadata + * returns the new fid, or negative value on error */ +int unifyfs_fid_create_file(unifyfs_client* client, + const char* path, + int exclusive) +{ + /* check that pathname is within bounds */ + size_t pathlen = strlen(path) + 1; + if (pathlen > UNIFYFS_MAX_FILENAME) { + return -ENAMETOOLONG; + } + + /* allocate an id for this file */ + int fid = unifyfs_fid_alloc(client); + if (fid < 0) { + return fid; + } + + /* mark this slot as in use */ + client->unifyfs_filelist[fid].in_use = 1; + + /* copy file name into slot */ + strlcpy((void*)&client->unifyfs_filelist[fid].filename, path, + UNIFYFS_MAX_FILENAME); + LOGDBG("Filename %s got unifyfs fid %d", + client->unifyfs_filelist[fid].filename, fid); + + /* get metadata for this file id */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + assert(meta != NULL); + + /* initialize file attributes */ + unifyfs_file_attr_set_invalid(&(meta->attrs)); + meta->attrs.gfid = unifyfs_generate_gfid(path); + meta->attrs.size = 0; + meta->attrs.mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; + meta->attrs.is_laminated = 0; + meta->attrs.is_shared = !exclusive; + meta->attrs.filename = (char*)&(client->unifyfs_filelist[fid].filename); + + /* use client user/group */ + meta->attrs.uid = getuid(); + meta->attrs.gid = getgid(); + + /* use current time for atime/mtime/ctime */ + struct timespec tp = {0}; + clock_gettime(CLOCK_REALTIME, &tp); + meta->attrs.atime = tp; + meta->attrs.mtime = tp; + meta->attrs.ctime = tp; + + /* set UnifyFS client metadata */ + meta->fid = fid; + meta->storage = FILE_STORAGE_NULL; + meta->needs_sync = 0; + + return fid; +} + +/* create directory state for given path. returns success|error */ +int unifyfs_fid_create_directory(unifyfs_client* client, + const char* path) +{ + /* check that pathname is within bounds */ + size_t pathlen = strlen(path) + 1; + if (pathlen > UNIFYFS_MAX_FILENAME) { + return ENAMETOOLONG; + } + + /* get local and global file ids */ + int fid = unifyfs_fid_from_path(client, path); + int gfid = unifyfs_generate_gfid(path); + + /* test whether we have info for file in our local file list */ + int found_local = (fid != -1); + + /* test whether we have metadata for file in global key/value store */ + int found_global = 0; + unifyfs_file_attr_t gfattr = { 0 }; + int rc = unifyfs_get_global_file_meta(client, gfid, &gfattr); + if (UNIFYFS_SUCCESS == rc) { + found_global = 1; + } + + if (found_local && !found_global) { + /* exists locally, but not globally + * + * FIXME: so, we have detected the cache inconsistency here. 
+ * we cannot simply unlink or remove the entry because then we also + * need to check whether any subdirectories or files exist. + * + * this can happen when + * - a process created a directory. this process (A) has opened it at + * least once. + * - then, the directory has been deleted by another process (B). it + * deletes the global entry without checking any local used entries + * in other processes. + * + * we currently return EEXIST, and this needs to be addressed according + * to a consistency model this fs intance assumes. + */ + return EEXIST; + } + + /* now, we need to create a new directory. we reuse the file creation + * method and then update the mode to indicate it's a directory */ + if (!found_local) { + /* create a new file */ + fid = unifyfs_fid_create_file(client, path, 0); + if (fid < 0) { + /* convert negative error code to positive */ + return -fid; + } + + /* mark it as a directory */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + assert(meta != NULL); + meta->attrs.mode = (meta->attrs.mode & ~S_IFREG) | S_IFDIR; + + if (!found_global) { + /* insert global meta data for directory */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; + rc = unifyfs_set_global_file_meta_from_fid(client, fid, op); + if (rc != UNIFYFS_SUCCESS) { + if (rc != EEXIST) { + LOGERR("Failed to add global metadata for dir %s (rc=%d)", + path, rc); + return rc; + } /* else, someone else created global metadata first */ + } + } + } + + return UNIFYFS_SUCCESS; +} + +/* delete a file id, free its local storage resources and return + * the file id to free stack */ +int unifyfs_fid_delete(unifyfs_client* client, + int fid) +{ + /* finalize the storage we're using for this file */ + int rc = fid_storage_free(client, fid); + if (rc != UNIFYFS_SUCCESS) { + /* failed to release structures tracking storage, + * bail out to keep its file id active */ + return rc; + } + + /* set this file id as not in use */ + client->unifyfs_filelist[fid].in_use = 0; + + /* add this id back to the free stack */ + rc = unifyfs_fid_free(client, fid); + if (rc != UNIFYFS_SUCCESS) { + /* storage for the file was released, but we hit + * an error while freeing the file id */ + return rc; + } + + return UNIFYFS_SUCCESS; +} + +/* opens a new file id with specified path, access flags, and permissions, + * fills outfid with file id and outpos with position for current file pointer, + * returns UNIFYFS error code + */ +int unifyfs_fid_open( + unifyfs_client* client, + const char* path, /* path of file to be opened */ + int flags, /* flags bits as from open(2) */ + mode_t mode, /* mode bits as from open(2) */ + int* outfid, /* allocated local file id if open is successful */ + off_t* outpos) /* initial file position if open is successful */ +{ + int ret; + + /* set the pointer to the start of the file */ + off_t pos = 0; + + /* check that pathname is within bounds */ + size_t pathlen = strlen(path) + 1; + if (pathlen > UNIFYFS_MAX_FILENAME) { + return ENAMETOOLONG; + } + + /* + * TODO: The test of file existence involves both local and global checks. + * However, the testing below does not seem to cover all cases. For + * instance, a globally unlinked file might be still cached locally because + * the broadcast for cache invalidation has not been implemented, yet. 
+ */ + + /* look for local and global file ids */ + int fid = unifyfs_fid_from_path(client, path); + int gfid = unifyfs_generate_gfid(path); + LOGDBG("unifyfs_fid_from_path() gave %d (gfid = %d)", fid, gfid); + + /* test whether we have info for file in our local file list */ + int found_local = (fid >= 0); + + /* determine whether any write flags are specified */ + int open_for_write = flags & (O_RDWR | O_WRONLY); + + int exclusive = flags & O_EXCL; + + /* struct to hold global metadata for file */ + unifyfs_file_attr_t gfattr = { 0, }; + + /* if O_CREAT, + * if not local, allocate fid and storage + * create from local fid meta + * attempt to create global inode + * if EEXIST and O_EXCL, error and release fid/storage + * lookup global meta + * check that local and global info are consistent + * if O_TRUNC and not laminated, truncate + * else + * lookup global meta + * if not found, error + * check that local and global info are consistent + * if O_APPEND, set pos to file size + */ + + /* flag indicating whether file should be truncated */ + int need_truncate = 0; + + /* determine whether we are creating a new file + * or opening an existing one */ + if (flags & O_CREAT) { + /* user wants to create a new file, + * allocate a local file id structure if needed */ + if (!found_local) { + /* initialize local metadata for this file */ + fid = unifyfs_fid_create_file(client, path, exclusive); + if (fid < 0) { + LOGERR("failed to create a new file %s", path); + return -fid; + } + + /* initialize local storage for this file */ + ret = fid_store_alloc(client, fid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to allocate storage space for file %s (fid=%d)", + path, fid); + unifyfs_fid_delete(client, fid); + return ret; + } + + /* TODO: set meta->mode bits to mode variable */ + } + + /* insert file attribute for file in key-value store */ + unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; + ret = unifyfs_set_global_file_meta_from_fid(client, fid, op); + if (ret == EEXIST && !exclusive) { + /* File didn't exist before, but now it does. + * Another process beat us to the punch in creating it. + * Read its metadata to update our cache. */ + ret = unifyfs_get_global_file_meta(client, gfid, &gfattr); + if (ret == UNIFYFS_SUCCESS) { + if (found_local) { + /* TODO: check that global metadata is consistent with + * our existing local entry */ + } + + /* Successful in fetching metadata for existing file. + * Update our local cache using that metadata. */ + unifyfs_fid_update_file_meta(client, fid, &gfattr); + } else { + /* Failed to get metadata for a file that should exist. + * Perhaps it was since deleted. We could try to create + * it again and loop through these steps, but for now + * consider this situation to be an error. 
*/ + LOGERR("Failed to get metadata on existing file %s (fid:%d)", + path, fid); + } + + /* check for truncate if the file exists already */ + if ((flags & O_TRUNC) && open_for_write && !gfattr.is_laminated) { + need_truncate = 1; + } + } + if (ret != UNIFYFS_SUCCESS) { + LOGERR("Failed to populate the global meta entry for %s (fid:%d)", + path, fid); + if (!found_local) { + /* free fid we just allocated above, + * but don't do that by calling fid_unlink */ + unifyfs_fid_delete(client, fid); + } + return ret; + } + } else { + /* trying to open without creating, file must already exist, + * lookup global metadata for file */ + ret = unifyfs_get_global_file_meta(client, gfid, &gfattr); + if (ret != UNIFYFS_SUCCESS) { + /* bail out if we failed to find global file */ + if (found_local && ret == ENOENT) { + /* Have a local entry, but there is no global entry. + * Perhaps global file was unlinked? + * Invalidate our local entry. */ + LOGDBG("file found locally, but seems to be deleted globally. " + "invalidating the local cache."); + unifyfs_fid_delete(client, fid); + } + + return ret; + } + + /* succeeded in global lookup for file, + * allocate a local file id structure if needed */ + if (!found_local) { + /* initialize local metadata for this file */ + fid = unifyfs_fid_create_file(client, path, 0); + if (fid < 0) { + LOGERR("failed to create a new file %s", path); + return -fid; + } + + /* initialize local storage for this file */ + ret = fid_store_alloc(client, fid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to allocate storage space for file %s (fid=%d)", + path, fid); + /* free fid we just allocated above, + * but don't do that by calling fid_unlink */ + unifyfs_fid_delete(client, fid); + return ret; + } + } else { + /* TODO: already have a local entry for this path and found + * a global entry, check that they are consistent */ + } + + /* Successful in fetching metadata for existing file. + * Update our local cache using that metadata. */ + unifyfs_fid_update_file_meta(client, fid, &gfattr); + + /* check if we need to truncate the existing file */ + if ((flags & O_TRUNC) && open_for_write && !gfattr.is_laminated) { + need_truncate = 1; + } + } + + /* if given O_DIRECTORY, the named file must be a directory */ + if ((flags & O_DIRECTORY) && !unifyfs_fid_is_dir(client, fid)) { + if (!found_local) { + /* free fid we just allocated above, + * but don't do that by calling fid_unlink */ + unifyfs_fid_delete(client, fid); + } + return ENOTDIR; + } + + /* TODO: does O_DIRECTORY really have to be given to open a directory? */ + if (!(flags & O_DIRECTORY) && unifyfs_fid_is_dir(client, fid)) { + if (!found_local) { + /* free fid we just allocated above, + * but don't do that by calling fid_unlink */ + unifyfs_fid_delete(client, fid); + } + return EISDIR; + } + + /* + * Catch any case where we could potentially want to write to a laminated + * file. 
+ */ + if (gfattr.is_laminated && + ((flags & (O_CREAT | O_TRUNC | O_APPEND | O_WRONLY)) || + ((mode & 0222) && (flags != O_RDONLY)))) { + LOGDBG("Can't open laminated file %s with a writable flag.", path); + /* TODO: free fid we just allocated above, + * but don't do that by calling fid_unlink */ + if (!found_local) { + /* free fid we just allocated above, + * but don't do that by calling fid_unlink */ + unifyfs_fid_delete(client, fid); + } + return EROFS; + } + + /* truncate the file, if we have to */ + if (need_truncate) { + ret = unifyfs_fid_truncate(client, fid, 0); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("Failed to truncate the file %s", path); + return ret; + } + } + + /* do we normally update position to EOF with O_APPEND? */ + if ((flags & O_APPEND) && open_for_write) { + /* We only support O_APPEND on non-laminated files */ + pos = unifyfs_fid_logical_size(client, fid); + } + + /* return local file id and starting file position */ + *outfid = fid; + *outpos = pos; + + return UNIFYFS_SUCCESS; +} + +int unifyfs_fid_close(unifyfs_client* client, + int fid) +{ + /* TODO: clear any held locks */ + + /* nothing to do here, just a place holder */ + return UNIFYFS_SUCCESS; +} + +/* unlink file and then delete its associated state */ +int unifyfs_fid_unlink(unifyfs_client* client, + int fid) +{ + int rc; + + /* invoke unlink rpc */ + int gfid = unifyfs_gfid_from_fid(client, fid); + rc = invoke_client_unlink_rpc(client, gfid); + if (rc != UNIFYFS_SUCCESS) { + /* TODO: if item does not exist globally, but just locally, + * we still want to delete item locally */ + return rc; + } + + /* finalize the storage we're using for this file */ + rc = unifyfs_fid_delete(client, fid); + if (rc != UNIFYFS_SUCCESS) { + /* released storage for file, but failed to release + * structures tracking storage, again bail out to keep + * its file id active */ + return rc; + } + + return UNIFYFS_SUCCESS; +} + + +/* --------------------------------------- + * fid metadata query operations + * --------------------------------------- */ + +/* given a file id, return a pointer to the meta data, + * otherwise return NULL */ +unifyfs_filemeta_t* unifyfs_get_meta_from_fid(unifyfs_client* client, + int fid) +{ + /* check that the file id is within range of our array */ + if (fid >= 0 && fid < client->max_files) { + /* get a pointer to the file meta data structure */ + unifyfs_filemeta_t* meta = &(client->unifyfs_filemetas[fid]); + return meta; + } + return NULL; +} + +/* given a file id, return 1 if file is laminated, 0 otherwise */ +int unifyfs_fid_is_laminated(unifyfs_client* client, + int fid) +{ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((meta != NULL) && (meta->fid == fid)) { + return meta->attrs.is_laminated; + } + return 0; +} + +/* checks to see if fid is a directory + * returns 1 for yes + * returns 0 for no */ +int unifyfs_fid_is_dir(unifyfs_client* client, + int fid) +{ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((meta != NULL) && (meta->attrs.mode & S_IFDIR)) { + return 1; + } + return 0; +} + +/* checks to see if a directory is empty + * assumes that check for is_dir has already been made + * only checks for full path matches, does not check relative paths, + * e.g. 
../dirname will not work + * returns 1 for yes it is empty + * returns 0 for no */ +int unifyfs_fid_is_dir_empty(unifyfs_client* client, + const char* path) +{ + int i = 0; + while (i < client->max_files) { + /* only check this element if it's active */ + if (client->unifyfs_filelist[i].in_use) { + /* if the file starts with the path, it is inside of that directory + * also check that it's not the directory entry itself */ + char* strptr = strstr(path, client->unifyfs_filelist[i].filename); + if (strptr == client->unifyfs_filelist[i].filename && + strcmp(path, client->unifyfs_filelist[i].filename) != 0) { + /* found a child item in path */ + LOGDBG("File found: unifyfs_filelist[%d].filename = %s", + i, (char*)&client->unifyfs_filelist[i].filename); + return 0; + } + } + + /* go on to next file */ + i++; + } + + /* couldn't find any files with this prefix, dir must be empty */ + return 1; +} + +int unifyfs_gfid_from_fid(unifyfs_client* client, + int fid) +{ + /* check that local file id is in range */ + if (fid < 0 || fid >= client->max_files) { + return -1; + } + + /* return global file id, cached in file meta struct */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if (meta != NULL) { + return meta->attrs.gfid; + } else { + return -1; + } +} + +/* scan list of files and return fid corresponding to target gfid, + * returns -1 if not found */ +int unifyfs_fid_from_gfid(unifyfs_client* client, + int gfid) +{ + int i; + for (i = 0; i < client->max_files; i++) { + if (client->unifyfs_filelist[i].in_use && + client->unifyfs_filemetas[i].attrs.gfid == gfid) { + /* found a file id that's in use and it matches + * the target fid, this is the one */ + return i; + } + } + return -1; +} + +/* Given a fid, return the path. */ +const char* unifyfs_path_from_fid(unifyfs_client* client, + int fid) +{ + unifyfs_filename_t* fname = &client->unifyfs_filelist[fid]; + if (fname->in_use) { + return fname->filename; + } + return NULL; +} + +/* Given a path, return the local file id, or -1 if not found */ +int unifyfs_fid_from_path(unifyfs_client* client, + const char* path) +{ + /* scan through active entries in filelist array looking + * for a match of path */ + int i = 0; + while (i < client->max_files && client->unifyfs_filelist[i].in_use) { + const char* filename = client->unifyfs_filelist[i].filename; + if (0 == strcmp(filename, path)) { + LOGDBG("File found: unifyfs_filelist[%d].filename = %s", + i, (char*)filename); + return i; + } + i++; + } + + /* couldn't find specified path */ + return -1; +} + +/* Return the global (laminated) size of the file */ +off_t unifyfs_fid_global_size(unifyfs_client* client, + int fid) +{ + /* get meta data for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if (meta != NULL) { + return meta->attrs.size; + } + return (off_t)-1; +} + +/* + * Return the size of the file. If the file is laminated, return the + * laminated size. If the file is not laminated, return the local + * size. 
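+ *
+ * In practice this means a laminated file's size is answered from the
+ * cached attributes, while a file that is still being written is answered
+ * by the server via a filesize RPC (after any pending extents are synced).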
+ */ +off_t unifyfs_fid_logical_size(unifyfs_client* client, + int fid) +{ + /* get meta data for this file */ + if (unifyfs_fid_is_laminated(client, fid)) { + off_t size = unifyfs_fid_global_size(client, fid); + return size; + } else { + /* invoke an rpc to ask the server what the file size is */ + + /* sync any writes to disk before requesting file size */ + unifyfs_fid_sync_extents(client, fid); + + /* get file size for this file */ + size_t filesize; + int gfid = unifyfs_gfid_from_fid(client, fid); + int ret = invoke_client_filesize_rpc(client, gfid, &filesize); + if (ret != UNIFYFS_SUCCESS) { + /* failed to get file size */ + return (off_t)-1; + } + return (off_t)filesize; + } +} + + +/* ======================================= + * I/O operations on fids + * ======================================= */ + +/* Find write extents that span or exceed truncation offset and remove them */ +static int fid_truncate_write_meta(unifyfs_client* client, + unifyfs_filemeta_t* meta, + off_t trunc_sz) +{ + if (0 == trunc_sz) { + /* All writes should be removed. Clear extents_sync */ + seg_tree_clear(&meta->extents_sync); + + if (client->use_local_extents) { + /* Clear the local extent cache too */ + seg_tree_clear(&meta->extents); + } + return UNIFYFS_SUCCESS; + } + + unsigned long trunc_off = (unsigned long) trunc_sz; + int rc = seg_tree_remove(&meta->extents_sync, trunc_off, ULONG_MAX); + if (client->use_local_extents) { + rc = seg_tree_remove(&meta->extents, trunc_off, ULONG_MAX); + } + if (rc) { + LOGERR("removal of write extents due to truncation failed"); + rc = UNIFYFS_FAILURE; + } else { + rc = UNIFYFS_SUCCESS; + } + return rc; +} + +/* truncate file id to given length, frees resources if length is + * less than size and allocates and zero-fills new bytes if length + * is more than size */ +int unifyfs_fid_truncate(unifyfs_client* client, + int fid, + off_t length) +{ + /* get meta data for this file */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + assert(meta != NULL); + + /* truncate is not valid for directories */ + if (S_ISDIR(meta->attrs.mode)) { + return EISDIR; + } + + if (meta->attrs.is_laminated) { + /* Can't truncate a laminated file */ + return EINVAL; + } + + if (meta->storage != FILE_STORAGE_LOGIO) { + /* unknown storage type */ + return EIO; + } + + /* remove/update writes past truncation size for this file id */ + int rc = fid_truncate_write_meta(client, meta, length); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* truncate is a sync point */ + rc = unifyfs_fid_sync_extents(client, fid); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* update global size in filemeta to reflect truncated size. + * note that log size is not affected */ + meta->attrs.size = length; + + /* invoke truncate rpc */ + int gfid = unifyfs_gfid_from_fid(client, fid); + rc = invoke_client_truncate_rpc(client, gfid, length); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + return UNIFYFS_SUCCESS; +} + +/* + * Clear all entries in the write log index. This only clears the metadata, + * not the data itself. 
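+ *
+ * Each index entry maps a file extent to its place in the write log,
+ * roughly { file_pos, log_pos, length, gfid }; clearing the index only
+ * resets the shared entry count, the logged bytes themselves are kept.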
+ */ +static void clear_index(unifyfs_client* client) +{ + *(client->state.write_index.ptr_num_entries) = 0; +} + +/* Add the metadata for a single write to the index */ +static int add_write_meta_to_index(unifyfs_client* client, + unifyfs_filemeta_t* meta, + off_t file_pos, + off_t log_pos, + size_t length) +{ + /* add write extent to our segment trees */ + if (client->use_local_extents) { + /* record write extent in our local cache */ + seg_tree_add(&meta->extents, + file_pos, + file_pos + length - 1, + log_pos); + } + + /* + * We want to make sure this write will not overflow the maximum + * number of index entries we can sync with server. A write can at most + * create two new nodes in the seg_tree. If we're close to potentially + * filling up the index, sync it out. + */ + unsigned long count_before = seg_tree_count(&meta->extents_sync); + if (count_before >= (client->max_write_index_entries - 2)) { + /* this will flush our segments, sync them, and set the running + * segment count back to 0 */ + unifyfs_fid_sync_extents(client, meta->fid); + } + + /* store the write in our segment tree used for syncing with server. */ + seg_tree_add(&meta->extents_sync, + file_pos, + file_pos + length - 1, + log_pos); + + return UNIFYFS_SUCCESS; +} + +/* + * Remove all entries in the current index and re-write it using the write + * metadata stored in the target file's extents_sync segment tree. This only + * re-writes the metadata in the index. All the actual data is still kept + * in the write log and will be referenced correctly by the new metadata. + * + * After this function is done, 'state.write_index' will have been totally + * re-written. The writes in the index will be flattened, non-overlapping, + * and sequential. The extents_sync segment tree will be cleared. + * + * This function is called when we sync our extents with the server. + * + * Returns maximum write log offset for synced extents. + */ +static off_t rewrite_index_from_seg_tree(unifyfs_client* client, + unifyfs_filemeta_t* meta) +{ + /* get pointer to index buffer */ + unifyfs_index_t* indexes = client->state.write_index.index_entries; + + /* Erase the index before we re-write it */ + clear_index(client); + + /* count up number of entries we wrote to buffer */ + unsigned long idx = 0; + + /* record maximum write log offset */ + off_t max_log_offset = 0; + + int gfid = meta->attrs.gfid; + + seg_tree_rdlock(&meta->extents_sync); + /* For each write in this file's seg_tree ... */ + struct seg_tree_node* node = NULL; + while ((node = seg_tree_iter(&meta->extents_sync, node))) { + indexes[idx].file_pos = node->start; + indexes[idx].log_pos = node->ptr; + indexes[idx].length = node->end - node->start + 1; + indexes[idx].gfid = gfid; + idx++; + if ((off_t)(node->end) > max_log_offset) { + max_log_offset = (off_t) node->end; + } + } + seg_tree_unlock(&meta->extents_sync); + /* All done processing this files writes. 
Clear its seg_tree */ + seg_tree_clear(&meta->extents_sync); + + /* record total number of entries in index buffer */ + *(client->state.write_index.ptr_num_entries) = idx; + + return max_log_offset; +} + +/* Sync data for file to server if needed */ +int unifyfs_fid_sync_extents(unifyfs_client* client, + int fid) +{ + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((NULL == meta) || (meta->fid != fid)) { + /* bail out with an error if we fail to find it */ + LOGERR("missing filemeta for fid=%d", fid); + return UNIFYFS_FAILURE; + } + + /* sync with server if we need to */ + if (meta->needs_sync) { + int rc; + + /* write contents from segment tree to index buffer */ + rewrite_index_from_seg_tree(client, meta); + + /* if there are no index entries, we've got nothing to sync */ + if (*(client->state.write_index.ptr_num_entries) == 0) { + /* consider that we've sync'd successfully */ + meta->needs_sync = 0; + return UNIFYFS_SUCCESS; + } + + /* tell the server to grab our new extents */ + rc = invoke_client_sync_rpc(client, meta->attrs.gfid); + if (UNIFYFS_SUCCESS != rc) { + /* something went wrong when trying to flush extents */ + LOGERR("failed to flush write index to server for gfid=%d", + meta->attrs.gfid); + ret = rc; + } + + /* we've sync'd, so mark this file as being up-to-date */ + meta->needs_sync = 0; + + /* flushed, clear buffer and refresh number of entries + * and number remaining */ + clear_index(client); + } + + return ret; +} + + +/* Write data to file using log-based I/O. + * Return UNIFYFS_SUCCESS, or error code */ +static int fid_logio_write( + unifyfs_client* client, + unifyfs_filemeta_t* meta, /* meta data for file */ + off_t pos, /* file position to start writing at */ + const void* buf, /* user buffer holding data */ + size_t count, /* number of bytes to write */ + size_t* nwritten) /* returns number of bytes written */ +{ + /* assume we'll fail to write anything */ + *nwritten = 0; + + assert(meta != NULL); + int fid = meta->fid; + int gfid = meta->attrs.gfid; + if (meta->storage != FILE_STORAGE_LOGIO) { + LOGERR("file (fid=%d) storage mode != FILE_STORAGE_LOGIO", fid); + return EINVAL; + } + + /* allocate space in the log for this write */ + off_t log_off; + int rc = unifyfs_logio_alloc(client->state.logio_ctx, count, &log_off); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("logio_alloc(%zu) failed", count); + return rc; + } + + /* do the write */ + rc = unifyfs_logio_write(client->state.logio_ctx, log_off, count, + buf, nwritten); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("fid=%d gfid=%d logio_write(off=%zu, cnt=%zu) failed", + fid, gfid, log_off, count); + return rc; + } + + if (*nwritten < count) { + LOGWARN("partial logio_write() @ offset=%zu (%zu of %zu bytes)", + (size_t)log_off, *nwritten, count); + } else { + LOGDBG("fid=%d gfid=%d pos=%zu - successful logio_write() " + "@ log offset=%zu (%zu bytes)", + fid, gfid, (size_t)pos, (size_t)log_off, count); + } + + /* update our write metadata for this write */ + rc = add_write_meta_to_index(client, meta, pos, log_off, *nwritten); + return rc; +} + +/* Write count bytes from buf into file starting at offset pos. 
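+ *
+ * A typical call might look like the following sketch, where client, fid,
+ * buf, and len are assumed to have been set up by the caller:
+ *
+ *   size_t nwritten = 0;
+ *   int rc = unifyfs_fid_write(client, fid, 0, buf, len, &nwritten);
+ *   if (rc == UNIFYFS_SUCCESS && nwritten < len) {
+ *       // partial write: only nwritten bytes were logged
+ *   }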
+ * + * Returns UNIFYFS_SUCCESS, or an error code + */ +int unifyfs_fid_write( + unifyfs_client* client, + int fid, /* local file id to write to */ + off_t pos, /* starting position in file */ + const void* buf, /* buffer to be written */ + size_t count, /* number of bytes to write */ + size_t* nwritten) /* returns number of bytes written */ +{ + int rc; + + /* assume we won't write anything */ + *nwritten = 0; + + /* short-circuit a 0-byte write */ + if (count == 0) { + return UNIFYFS_SUCCESS; + } + + /* get meta for this file id */ + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + assert(meta != NULL); + + if (meta->attrs.is_laminated) { + /* attempt to write to laminated file, return read-only filesystem */ + return EROFS; + } + + /* determine storage type to write file data */ + if (meta->storage == FILE_STORAGE_LOGIO) { + /* file stored in logged i/o */ + rc = fid_logio_write(client, meta, pos, buf, count, nwritten); + if (rc == UNIFYFS_SUCCESS) { + /* write succeeded, remember that we have new data + * that needs to be synced with the server */ + meta->needs_sync = 1; + + /* optionally sync after every write */ + if (client->use_write_sync) { + int ret = unifyfs_fid_sync_extents(client, fid); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("client sync after write failed"); + rc = ret; + } + } + } + } else { + /* unknown storage type */ + LOGERR("unknown storage type for fid=%d", fid); + rc = EIO; + } + + return rc; +} + diff --git a/client/src/unifyfs_fid.h b/client/src/unifyfs_fid.h new file mode 100644 index 000000000..66c9e3af0 --- /dev/null +++ b/client/src/unifyfs_fid.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_FID_H +#define UNIFYFS_FID_H + +#include "unifyfs_api_internal.h" + + +/* --- file id (fid) management --- */ + +/* Allocate a fid slot for a new entry. + * Return the fid or -1 on error */ +int unifyfs_fid_alloc(unifyfs_client* client); + +/* Add the fid slot back to the free pool */ +int unifyfs_fid_free(unifyfs_client* client, + int fid); + + +/* --- fid metadata updates --- */ + +/* Add a new file and initialize metadata. + * Returns the new fid, or negative error value */ +int unifyfs_fid_create_file(unifyfs_client* client, + const char* path, + int exclusive); + +/* Add a new directory and initialize metadata. + * Returns the new fid, or a negative error value */ +int unifyfs_fid_create_directory(unifyfs_client* client, + const char* path); + +/* Opens a file with specified path, access flags, and permissions. + * Sets *outfid to file id and *outpos to current file position. 
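+ *
+ * Illustrative call, assuming client points at an initialized client and
+ * the path lies within the UnifyFS namespace:
+ *
+ *   int fid = -1;
+ *   off_t pos = 0;
+ *   int rc = unifyfs_fid_open(client, "/unifyfs/myfile",
+ *                             O_CREAT | O_WRONLY, 0644, &fid, &pos);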
+ */ +int unifyfs_fid_open(unifyfs_client* client, + const char* path, + int flags, + mode_t mode, + int* outfid, + off_t* outpos); + +/* Close file with given file id */ +int unifyfs_fid_close(unifyfs_client* client, + int fid); + +/* Update local metadata for file from global metadata */ +int unifyfs_fid_update_file_meta(unifyfs_client* client, + int fid, + unifyfs_file_attr_t* gfattr); + +/* Unlink file and then delete its associated state */ +int unifyfs_fid_unlink(unifyfs_client* client, + int fid); + +/* Release the file's metadata and storage resources, and return + * the fid to free stack */ +int unifyfs_fid_delete(unifyfs_client* client, + int fid); + +/* Use local file metadata to update global metadata */ +int unifyfs_set_global_file_meta_from_fid(unifyfs_client* client, + int fid, + unifyfs_file_attr_op_e op); + + +/* --- fid metadata queries --- */ + +/* Given a file id, return a pointer to the metadata, + * otherwise return NULL */ +unifyfs_filemeta_t* unifyfs_get_meta_from_fid(unifyfs_client* client, + int fid); + +/* Return 1 if fid is laminated, 0 if not */ +int unifyfs_fid_is_laminated(unifyfs_client* client, + int fid); + +/* Given a fid, return the path */ +const char* unifyfs_path_from_fid(unifyfs_client* client, + int fid); + +/* Given a path, return the fid */ +int unifyfs_fid_from_path(unifyfs_client* client, + const char* path); + +/* Given a fid, return a gfid */ +int unifyfs_gfid_from_fid(unifyfs_client* client, + int fid); + +/* Returns fid for corresponding gfid, if one is active. + * Otherwise, returns -1 */ +int unifyfs_fid_from_gfid(unifyfs_client* client, + int gfid); + +/* Checks to see if fid is a directory. + * Returns 1 for yes, 0 for no */ +int unifyfs_fid_is_dir(unifyfs_client* client, + int fid); + +/* Checks to see if a directory is empty. + * Assumes that check for is_dir has already been made. + * Only checks for full path matches, does not check relative paths + * (i.e., '../dirname' will not work). + * Returns 1 for yes it is empty, 0 for no */ +int unifyfs_fid_is_dir_empty(unifyfs_client* client, + const char* path); + +/* Return current global size of given file id */ +off_t unifyfs_fid_global_size(unifyfs_client* client, + int fid); + +/* Return current size of given file id. If the file is laminated, + * returns the global size. Otherwise, returns the local size. */ +off_t unifyfs_fid_logical_size(unifyfs_client* client, + int fid); + + +/* --- fid I/O operations --- */ + +/* Write count bytes from buf into file starting at offset pos */ +int unifyfs_fid_write( + unifyfs_client* client, + int fid, /* local file id to write to */ + off_t pos, /* starting offset within file */ + const void* buf, /* buffer of data to be written */ + size_t count, /* number of bytes to write */ + size_t* nwritten /* returns number of bytes written */ +); + +/* Truncate file to given length. Removes or truncates file extents + * in metadata that are past the given length. 
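+ *
+ * For example, unifyfs_fid_truncate(client, fid, 0) drops every write
+ * extent the client has recorded for the file and propagates the new
+ * (zero) size to the server.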
+ */
+int unifyfs_fid_truncate(unifyfs_client* client,
+                         int fid,
+                         off_t length);
+
+/* Sync extent data for file to server if needed */
+int unifyfs_fid_sync_extents(unifyfs_client* client,
+                             int fid);
+
+#endif /* UNIFYFS_FID_H */
diff --git a/client/src/unifyfs_wrap.h b/client/src/unifyfs_wrap.h
new file mode 100644
index 000000000..f83ff770e
--- /dev/null
+++ b/client/src/unifyfs_wrap.h
@@ -0,0 +1,115 @@
+#ifndef UNIFYFS_WRAP_H
+#define UNIFYFS_WRAP_H
+
+#include "config.h"
+
+/* single function to route all unsupported wrapper calls through */
+void unifyfs_vunsupported(const char* fn_name,
+                          const char* file,
+                          int line,
+                          const char* fmt,
+                          va_list args);
+
+void unifyfs_unsupported(const char* fn_name,
+                         const char* file,
+                         int line,
+                         const char* fmt,
+                         ...);
+
+/* Define a macro to indicate unsupported function. Capture function name,
+ * file name, and line number along with a user-defined string */
+#define UNIFYFS_UNSUPPORTED(fmt, args...) \
+    unifyfs_unsupported(__func__, __FILE__, __LINE__, fmt, ##args)
+
+
+#if UNIFYFS_GOTCHA
+
+/* Definitions to support wrapping functions using Gotcha */
+
+#include <gotcha/gotcha.h>
+
+/* Our wrapper function uses the name __wrap_<name> for <name> */
+#define UNIFYFS_WRAP(name) __wrap_##name
+
+/* the name of the real function pointer */
+#define UNIFYFS_REAL(name) __real_##name
+
+/* Declare gotcha handle, real function pointer, and our wrapper */
+#define UNIFYFS_DECL(name, ret, args) \
+    extern gotcha_wrappee_handle_t wrappee_handle_##name; \
+    extern ret (*__real_##name) args; \
+    ret __wrap_##name args
+
+/* ask gotcha for the address of the real function */
+#define MAP_OR_FAIL(name) \
+    do { \
+        if (NULL == __real_##name) { \
+            __real_##name = gotcha_get_wrappee(wrappee_handle_##name); \
+            if (NULL == __real_##name) { \
+                assert(!"missing Gotcha wrappee for " #name); \
+            } \
+        } \
+    } while (0)
+
+int setup_gotcha_wrappers(void);
+
+#elif UNIFYFS_PRELOAD
+
+/* ========================================================================
+ * Using LD_PRELOAD to wrap functions
+ * ========================================================================
+ * We need to use the same function names the application is calling, and
+ * we then invoke the real function after looking it up with dlsym() */
+
+/* we need the dlsym() function */
+#include <dlfcn.h>
+
+/* Our wrapper uses the original function name */
+#define UNIFYFS_WRAP(name) name
+
+/* the address of the real open call is stored in __real_open variable */
+#define UNIFYFS_REAL(name) __real_##name
+
+/* Declare a static variable called __real_<name> to record the
+ * address of the real function and initialize it to NULL */
+#define UNIFYFS_DECL(name, ret, args) \
+    static ret (*__real_##name) args = NULL;
+
+/* if __real_<name> is still NULL, call dlsym to lookup address of real
+ * function and record it */
+#define MAP_OR_FAIL(func) \
+    if (NULL == __real_##func) { \
+        __real_##func = dlsym(RTLD_NEXT, #func); \
+        if (NULL == __real_##func) { \
+            fprintf(stderr, "UnifyFS failed to map symbol: %s\n", #func); \
+            abort(); \
+        } \
+    }
+
+#else /* Use linker wrapping */
+
+/* ========================================================================
+ * Using ld -wrap option to wrap functions
+ * ========================================================================
+ * The linker converts application calls from <name> --> __wrap_<name>,
+ * and renames the wrapped function to __real_<name>.
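+ * (For instance, linking with "-Wl,--wrap=read" routes the application's
+ * read() calls to __wrap_read(), while calls to __real_read() resolve to
+ * the original read() in libc.)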
We define our + * wrapper functions as the __wrap_ variant and then to call the + * real function, we use __real_ */ + +/* Our wrapper function uses the name __wrap_ for */ +#define UNIFYFS_WRAP(name) __wrap_##name + +/* The linker renames the wrapped function to __real_ */ +#define UNIFYFS_REAL(name) __real_##name + +/* Declare the existence of the real function and our wrapper */ +#define UNIFYFS_DECL(name, ret, args) \ + extern ret __real_##name args; \ + ret __wrap_##name args; + +/* no need to look up the address of the real function */ +#define MAP_OR_FAIL(func) + +#endif /* wrapping mode */ + +#endif /* UNIFYFS_WRAP_H */ diff --git a/client/src/unifyfsf.c b/client/src/unifyfsf.c index 07e405ff7..1a6db01e1 100644 --- a/client/src/unifyfsf.c +++ b/client/src/unifyfsf.c @@ -141,8 +141,9 @@ static int unifyfs_cstr2fstr(const char* cstr, char* fstr, int flen) FORTRAN_API void FORT_CALL unifyfs_mount_(char* prefix FORT_MIXED_LEN(prefix_len), - int* rank, int* size, int* app_id, - int* ierror FORT_END_LEN(prefix_len)) + int* rank, int* size, + int* ierror + FORT_END_LEN(prefix_len)) { /* convert name from a Fortran string to C string */ char prefix_tmp[1024]; @@ -155,8 +156,7 @@ void FORT_CALL unifyfs_mount_(char* prefix FORT_MIXED_LEN(prefix_len), int rank_tmp = *rank; int size_tmp = *size; - int app_id_tmp = *app_id; - *ierror = unifyfs_mount(prefix_tmp, rank_tmp, size_tmp, app_id_tmp); + *ierror = unifyfs_mount(prefix_tmp, rank_tmp, size_tmp); return; } diff --git a/common/src/Makefile.mk b/common/src/Makefile.mk index 8069d4478..7842bafac 100644 --- a/common/src/Makefile.mk +++ b/common/src/Makefile.mk @@ -19,6 +19,7 @@ UNIFYFS_COMMON_BASE_SRCS = \ %reldir%/tinyexpr.h \ %reldir%/tinyexpr.c \ %reldir%/tree.h \ + %reldir%/unifyfs_client.h \ %reldir%/unifyfs_const.h \ %reldir%/unifyfs_configurator.h \ %reldir%/unifyfs_configurator.c \ diff --git a/common/src/arraylist.c b/common/src/arraylist.c index 9f0763d28..0892adc15 100644 --- a/common/src/arraylist.c +++ b/common/src/arraylist.c @@ -99,6 +99,9 @@ void* arraylist_remove(arraylist_t* arr, int pos) return item; } +/* Inserts element at given index (pos) in the arraylist. + * Overwrites (and frees) any existing element at that index. + * Returns 0 on success, or -1 on error */ int arraylist_insert(arraylist_t* arr, int pos, void* elem) { if (NULL == arr) { @@ -133,36 +136,21 @@ int arraylist_insert(arraylist_t* arr, int pos, void* elem) return 0; } - +/* Adds element to the end of the current list. + * Returns list index of newly added element, or -1 on error */ int arraylist_add(arraylist_t* arr, void* elem) { if (NULL == arr) { return -1; } - if (arr->size == arr->cap) { - int newcap = 2 * arr->cap; - void** newlist = (void**) realloc(arr->elems, - newcap * sizeof(void*)); - if (NULL == newlist) { - return -1; - } - arr->elems = newlist; - arr->cap = newcap; - - int i; - for (i = arr->size; i < newcap; i++) { - arr->elems[i] = NULL; - } - } - - if (arr->elems[arr->size] != NULL) { - free(arr->elems[arr->size]); + int pos = arr->size; + int rc = arraylist_insert(arr, pos, elem); + if (rc == -1) { + return rc; + } else { + return pos; } - arr->elems[arr->size] = elem; - arr->size += 1; - - return 0; } int arraylist_reset(arraylist_t* arr) diff --git a/common/src/unifyfs_client.h b/common/src/unifyfs_client.h new file mode 100644 index 000000000..ab05b1b47 --- /dev/null +++ b/common/src/unifyfs_client.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. 
+ * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_CLIENT_H +#define UNIFYFS_CLIENT_H + +#include "unifyfs_logio.h" +#include "unifyfs_meta.h" +#include "unifyfs_shm.h" + + + /* client state used by both client library & server */ +typedef struct client_state { + /* mountpoint information */ + char* mount_prefix; /* mountpoint prefix string */ + size_t mount_prefixlen; /* strlen() of mount_prefix */ + int app_id; /* application id (gfid for mountpoint) */ + int client_id; /* client id within application */ + bool is_mounted; /* is mountpoint active? */ + + /* application rank (for debugging) */ + int app_rank; + + /* tracks current working directory within namespace */ + char* cwd; + + /* has all the client's state (below) been initialized? */ + bool initialized; + + /* log-based I/O context */ + logio_context* logio_ctx; + + /* superblock - shared memory region for client metadata */ + shm_context* shm_super_ctx; + unifyfs_write_index write_index; + +} unifyfs_client_state; + +#endif /* UNIFYFS_CLIENT_H */ diff --git a/common/src/unifyfs_client_rpcs.h b/common/src/unifyfs_client_rpcs.h index c5d162109..3aaa398e8 100644 --- a/common/src/unifyfs_client_rpcs.h +++ b/common/src/unifyfs_client_rpcs.h @@ -41,6 +41,7 @@ typedef enum { UNIFYFS_CLIENT_RPC_MOUNT, UNIFYFS_CLIENT_RPC_READ, UNIFYFS_CLIENT_RPC_SYNC, + UNIFYFS_CLIENT_RPC_TRANSFER, UNIFYFS_CLIENT_RPC_TRUNCATE, UNIFYFS_CLIENT_RPC_UNLINK, UNIFYFS_CLIENT_RPC_UNMOUNT @@ -134,6 +135,35 @@ MERCURY_GEN_PROC(unifyfs_filesize_out_t, ((hg_size_t)(filesize))) DECLARE_MARGO_RPC_HANDLER(unifyfs_filesize_rpc) +/* unifyfs_transfer_rpc (client => server) + * + * given an app_id, client_id, transfer id, global file id, transfer mode, + * and a destination file path, transfer data to that file */ +MERCURY_GEN_PROC(unifyfs_transfer_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(transfer_id)) + ((int32_t)(gfid)) + ((int32_t)(mode)) + ((hg_const_string_t)(dst_file))) +MERCURY_GEN_PROC(unifyfs_transfer_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_transfer_rpc) + +/* unifyfs_transfer_complete_rpc (server => client) + * + * Transfer completion response for a request with specified transfer_id. + * + * A non-zero error_code indicates the server encountered an error during + * processing of the request. */ +MERCURY_GEN_PROC(unifyfs_transfer_complete_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(transfer_id)) + ((int32_t)(error_code))) +MERCURY_GEN_PROC(unifyfs_transfer_complete_out_t, ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_transfer_complete_rpc) + /* unifyfs_truncate_rpc (client => server) * * given an app_id, client_id, global file id, @@ -194,6 +224,8 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_mread_rpc) * read_offset is the offset to be added to the start offset of the request, * and is used to transfer data for very large extents in multiple chunks. */ MERCURY_GEN_PROC(unifyfs_mread_req_data_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) ((int32_t)(mread_id)) ((int32_t)(read_index)) ((hg_size_t)(read_offset)) @@ -211,6 +243,8 @@ DECLARE_MARGO_RPC_HANDLER(unifyfs_mread_req_data_rpc) * A non-zero read_error indicates the server encountered an error during * processing of the request. 
*/ MERCURY_GEN_PROC(unifyfs_mread_req_complete_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) ((int32_t)(mread_id)) ((int32_t)(read_index)) ((int32_t)(read_error))) diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 16c797705..2c12eda64 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -85,14 +85,11 @@ UNIFYFS_CFG(logio, spill_dir, STRING, NULLSTRING, "spillover directory", configurator_directory_check) \ UNIFYFS_CFG(margo, lazy_connect, BOOL, off, "wait until first communication with server to resolve its connection address", NULL) \ UNIFYFS_CFG(margo, tcp, BOOL, on, "use TCP for server-to-server margo RPCs", NULL) \ - UNIFYFS_CFG(meta, db_name, STRING, META_DEFAULT_DB_NAME, "metadata database name", NULL) \ - UNIFYFS_CFG(meta, db_path, STRING, RUNDIR, "metadata database path", configurator_directory_check) \ - UNIFYFS_CFG(meta, server_ratio, INT, META_DEFAULT_SERVER_RATIO, "metadata server ratio", NULL) \ - UNIFYFS_CFG(meta, range_size, INT, META_DEFAULT_RANGE_SZ, "metadata range size", NULL) \ + UNIFYFS_CFG(meta, range_size, INT, UNIFYFS_META_DEFAULT_SLICE_SZ, "metadata range size", NULL) \ UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", configurator_directory_check, 'R', "specify full path to directory to contain server-local state") \ UNIFYFS_CFG_CLI(server, hostfile, STRING, NULLSTRING, "server hostfile name", NULL, 'H', "specify full path to server hostfile") \ UNIFYFS_CFG_CLI(server, init_timeout, INT, UNIFYFS_DEFAULT_INIT_TIMEOUT, "timeout of waiting for server initialization", NULL, 't', "timeout in seconds to wait for servers to be ready for clients") \ - UNIFYFS_CFG(server, max_app_clients, INT, MAX_APP_CLIENTS, "maximum number of clients per application", NULL) \ + UNIFYFS_CFG(server, max_app_clients, INT, UNIFYFS_SERVER_MAX_APP_CLIENTS, "maximum number of clients per application", NULL) \ UNIFYFS_CFG_CLI(sharedfs, dir, STRING, NULLSTRING, "shared file system directory", configurator_directory_check, 'S', "specify full path to directory to contain server shared files") \ #ifdef __cplusplus diff --git a/common/src/unifyfs_const.h b/common/src/unifyfs_const.h index b438c7899..d9afef6a0 100644 --- a/common/src/unifyfs_const.h +++ b/common/src/unifyfs_const.h @@ -30,62 +30,57 @@ #ifndef UNIFYFS_CONST_H #define UNIFYFS_CONST_H -/* ********************** RETURN CODES ************************ */ +/* --------------------- RETURN CODES --------------------- */ + #include "unifyfs_rc.h" -/* ********************** STRING CONSTANTS ************************ */ -#define DEFAULT_INTERFACE "ib0" -#define SOCKET_PATH "/tmp/unifyfs_server_sock" -/* ********************** INT CONSTANTS ************************ */ +/* --------------------- INT CONSTANTS --------------------- */ // Byte counts #define KIB 1024 #define MIB 1048576 #define GIB 1073741824 -// Generic -#define GEN_STR_LEN KIB +// General #define UNIFYFS_MAX_FILENAME KIB #define UNIFYFS_MAX_HOSTNAME 64 -// Server - Request Manager -#define MAX_DATA_TX_SIZE (4 * MIB) /* data transfer size (to client) */ -#define MAX_META_PER_SEND (4 * KIB) /* max read request count per server */ -#define REQ_BUF_LEN (MAX_META_PER_SEND * 64) /* chunk read reqs buffer size */ -#define SHM_WAIT_INTERVAL 1000 /* unit: ns */ -#define RM_MAX_SERVER_READS KIB - -// Server - General -#define MAX_BULK_TX_SIZE (8 * MIB) /* bulk transfer size (between servers) */ -#define MAX_NUM_APPS 64 /* max # apps/mountpoints supported */ -#define MAX_APP_CLIENTS 256 
/* max # clients per application */ -#define MIN_USLEEP_INTERVAL 50 /* unit: us */ -#define UNIFYFS_DEFAULT_INIT_TIMEOUT 120 /* server init timeout (seconds) */ -#define UNIFYFSD_PID_FILENAME "unifyfsd.pids" -#define UNIFYFS_STAGE_STATUS_FILENAME "unifyfs-stage.status" - // Client #define UNIFYFS_CLIENT_MAX_FILES 128 -#define UNIFYFS_CLIENT_MAX_FILEDESCS UNIFYFS_CLIENT_MAX_FILES #define UNIFYFS_CLIENT_STREAM_BUFSIZE MIB #define UNIFYFS_CLIENT_WRITE_INDEX_SIZE (20 * MIB) -#define UNIFYFS_CLIENT_MAX_READ_COUNT KIB /* max # active read requests */ +#define UNIFYFS_CLIENT_MAX_READ_COUNT 1000 /* max # active read requests */ #define UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS 60 #define UNIFYFS_CLIENT_MAX_ACTIVE_REQUESTS 64 /* max concurrent client reqs */ -// Log-based I/O +// Log-based I/O Default Values #define UNIFYFS_LOGIO_CHUNK_SIZE (4 * MIB) #define UNIFYFS_LOGIO_SHMEM_SIZE (256 * MIB) #define UNIFYFS_LOGIO_SPILL_SIZE (4 * GIB) -/* NOTE: max read size = UNIFYFS_MAX_SPLIT_CNT * META_DEFAULT_RANGE_SZ */ -#define UNIFYFS_MAX_SPLIT_CNT (4 * KIB) +// Metadata Default Values +#define UNIFYFS_META_DEFAULT_SLICE_SZ MIB /* data slice size for metadata */ + +// Server +#define UNIFYFS_SERVER_MAX_BULK_TX_SIZE (8 * MIB) /* to-server transmit size */ +#define UNIFYFS_SERVER_MAX_DATA_TX_SIZE (4 * MIB) /* to-client transmit size */ +#define UNIFYFS_SERVER_MAX_NUM_APPS 64 /* max # apps/mountpoints supported */ +#define UNIFYFS_SERVER_MAX_APP_CLIENTS 256 /* max # clients per application */ +#define UNIFYFS_SERVER_MAX_READS 2000 /* max server read reqs per reqmgr */ + +// Utilities +#define UNIFYFS_DEFAULT_INIT_TIMEOUT 120 /* server init timeout (seconds) */ + + +/* --------------------- STRING CONSTANTS --------------------- */ + +// Server +#define UNIFYFS_SERVER_PID_FILENAME "unifyfsd.pids" + +// Utilities +#define UNIFYFS_STAGE_STATUS_FILENAME "unifyfs-stage.status" -// Metadata/MDHIM Default Values -#define META_DEFAULT_DB_NAME unifyfs_db -#define META_DEFAULT_SERVER_RATIO 1 -#define META_DEFAULT_RANGE_SZ MIB #endif // UNIFYFS_CONST_H diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index 0b746fb5c..29f212f33 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -55,16 +55,16 @@ slot_map* log_header_to_chunkmap(log_header* hdr) /* convenience method to return system page size */ size_t get_page_size(void) { - size_t unifyfs_page_size = 4096; + size_t page_size = 4096; long sz = sysconf(_SC_PAGESIZE); if (sz != -1) { - unifyfs_page_size = (size_t) sz; + page_size = (size_t) sz; } else { LOGERR("sysconf(_SC_PAGESIZE) failed - errno=%d (%s)", errno, strerror(errno)); } - LOGDBG("returning page size %zu B", unifyfs_page_size); - return unifyfs_page_size; + LOGDBG("returning page size %zu B", page_size); + return page_size; } /* calculate number of chunks needed for requested bytes */ diff --git a/common/src/unifyfs_meta.c b/common/src/unifyfs_meta.c index fdbba6a99..10a3bd735 100644 --- a/common/src/unifyfs_meta.c +++ b/common/src/unifyfs_meta.c @@ -19,7 +19,7 @@ #include "unifyfs_meta.h" /* extent slice size used for metadata */ -size_t meta_slice_sz = META_DEFAULT_RANGE_SZ; +size_t meta_slice_sz = UNIFYFS_META_DEFAULT_SLICE_SZ; /* calculate number of slices in an extent given by start offset and length */ size_t meta_num_slices(size_t offset, size_t length) diff --git a/common/src/unifyfs_meta.h b/common/src/unifyfs_meta.h index b21cc8ce9..5c71092b4 100644 --- a/common/src/unifyfs_meta.h +++ b/common/src/unifyfs_meta.h @@ -56,7 +56,7 @@ typedef struct { int gfid; } 
unifyfs_extent_t; -/* write-log metadata index structure */ +/* write-log metadata index structures */ typedef struct { off_t file_pos; /* start offset of data in file */ off_t log_pos; /* start offset of data in write log */ @@ -64,6 +64,14 @@ typedef struct { int gfid; /* global file id */ } unifyfs_index_t; +typedef struct { + size_t index_size; /* size of index metadata region in bytes */ + size_t index_offset; /* superblock offset of index metadata region */ + + size_t* ptr_num_entries; /* pointer to number of index entries */ + unifyfs_index_t* index_entries; /* pointer to first unifyfs_index_t */ +} unifyfs_write_index; + /* UnifyFS file attributes */ typedef struct { char* filename; diff --git a/common/src/unifyfs_rc.h b/common/src/unifyfs_rc.h index cf54ad904..627108cc5 100644 --- a/common/src/unifyfs_rc.h +++ b/common/src/unifyfs_rc.h @@ -48,7 +48,7 @@ ENUMITEM(NYI, "Not yet implemented") \ ENUMITEM(PMI, "PMI2/PMIx error") \ ENUMITEM(SHMEM, "Shared memory region init/access error") \ - ENUMITEM(THRDINIT, "Thread initialization failed") \ + ENUMITEM(THREAD, "POSIX thread operation failed") \ ENUMITEM(TIMEOUT, "Timed out") \ diff --git a/common/src/unifyfs_server_rpcs.h b/common/src/unifyfs_server_rpcs.h index cf46a9373..69e71ab86 100644 --- a/common/src/unifyfs_server_rpcs.h +++ b/common/src/unifyfs_server_rpcs.h @@ -31,6 +31,7 @@ extern "C" { #endif +/* enumerate the various server-to-server rpcs */ typedef enum { UNIFYFS_SERVER_RPC_INVALID = 0, UNIFYFS_SERVER_RPC_CHUNK_READ, @@ -41,14 +42,26 @@ typedef enum { UNIFYFS_SERVER_RPC_METAGET, UNIFYFS_SERVER_RPC_METASET, UNIFYFS_SERVER_RPC_PID_REPORT, + UNIFYFS_SERVER_RPC_TRANSFER, UNIFYFS_SERVER_RPC_TRUNCATE, UNIFYFS_SERVER_BCAST_RPC_EXTENTS, UNIFYFS_SERVER_BCAST_RPC_FILEATTR, UNIFYFS_SERVER_BCAST_RPC_LAMINATE, + UNIFYFS_SERVER_BCAST_RPC_TRANSFER, UNIFYFS_SERVER_BCAST_RPC_TRUNCATE, UNIFYFS_SERVER_BCAST_RPC_UNLINK } server_rpc_e; +/* structure to track server-to-server rpc request state */ +typedef struct { + server_rpc_e req_type; + hg_handle_t handle; + void* coll; + void* input; + void* bulk_buf; + size_t bulk_sz; +} server_rpc_req_t; + /*---- Server Point-to-Point (p2p) RPCs ----*/ /* Report server pid to rank 0 */ @@ -140,6 +153,19 @@ MERCURY_GEN_PROC(metaset_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(metaset_rpc) +/* Transfer file */ +MERCURY_GEN_PROC(transfer_in_t, + ((int32_t)(src_rank)) + ((int32_t)(client_app)) + ((int32_t)(client_id)) + ((int32_t)(transfer_id)) + ((int32_t)(gfid)) + ((int32_t)(mode)) + ((hg_const_string_t)(dst_file))) +MERCURY_GEN_PROC(transfer_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(transfer_rpc) + /* Truncate file at owner */ MERCURY_GEN_PROC(truncate_in_t, ((int32_t)(gfid)) @@ -148,7 +174,7 @@ MERCURY_GEN_PROC(truncate_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(truncate_rpc) -/*---- Collective RPCs ----*/ +/*---- Server Collective RPCs ----*/ /* Finish an ongoing broadcast rpc */ MERCURY_GEN_PROC(bcast_progress_in_t, @@ -188,6 +214,16 @@ MERCURY_GEN_PROC(laminate_bcast_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(laminate_bcast_rpc) +/* Broadcast transfer request to all servers */ +MERCURY_GEN_PROC(transfer_bcast_in_t, + ((int32_t)(root)) + ((int32_t)(gfid)) + ((int32_t)(mode)) + ((hg_const_string_t)(dst_file))) +MERCURY_GEN_PROC(transfer_bcast_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(transfer_bcast_rpc) + /* Broadcast truncation point to all servers */ MERCURY_GEN_PROC(truncate_bcast_in_t, ((int32_t)(root)) diff --git a/docs/api.rst b/docs/api.rst index 
2cd7ed8cb..51778ed39 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -54,17 +54,16 @@ For instance, to use UnifyFS on all path prefixes that begin with .. code-block:: C :caption: C - unifyfs_mount('/unifyfs', rank, rank_num, 0); + unifyfs_mount('/unifyfs', rank, rank_num); .. code-block:: Fortran :caption: Fortran - call UNIFYFS_MOUNT('/unifyfs', rank, size, 0, ierr); + call UNIFYFS_MOUNT('/unifyfs', rank, size, ierr); Here, ``/unifyfs`` is the path prefix for UnifyFS to intercept. The ``rank`` parameter specifies the MPI rank of the calling process. The ``size`` parameter specifies the number of MPI ranks in the user job. -Lastly, the zero corresponds to the app id. --------------------------- Unmounting diff --git a/docs/library_api.rst b/docs/library_api.rst index 8da1d80b6..7bb0af843 100644 --- a/docs/library_api.rst +++ b/docs/library_api.rst @@ -239,7 +239,8 @@ Existing files can be opened by any client process using ``unifyfs_open()``. const char* filename = "/my/unifyfs/namespace/an/existing/file"; unifyfs_gfid gfid = UNIFYFS_INVALID_GFID; - int rc = unifyfs_open(fshdl, filename, &gfid); + int access_flags = O_RDWR; + int rc = unifyfs_open(fshdl, access_flags, filename, &gfid); When no longer required, files can be deleted using ``unifyfs_remove()``. diff --git a/examples/src/Makefile.am b/examples/src/Makefile.am index 215f9dee1..06f0f3db7 100644 --- a/examples/src/Makefile.am +++ b/examples/src/Makefile.am @@ -26,7 +26,8 @@ libexec_PROGRAMS += \ size-static \ transfer-static \ write-static \ - writeread-static + writeread-static \ + write-transfer-static endif #HAVE_LD_WRAP if HAVE_GOTCHA @@ -44,7 +45,8 @@ libexec_PROGRAMS += \ size-gotcha \ transfer-gotcha \ write-gotcha \ - writeread-gotcha + writeread-gotcha \ + write-transfer-gotcha if HAVE_HDF5 libexec_PROGRAMS += \ @@ -261,3 +263,12 @@ writeread_static_SOURCES = writeread.c testutil.c $(testutil_headers) writeread_static_CPPFLAGS = $(ex_mpi_cppflags) writeread_static_LDADD = $(ex_static_mpi_ldadd) writeread_static_LDFLAGS = $(ex_static_ldflags) + +write_transfer_gotcha_SOURCES = write-transfer.c testutil.c $(testutil_headers) +write_transfer_gotcha_CPPFLAGS = $(ex_mpi_cppflags) +write_transfer_gotcha_LDADD = $(ex_gotcha_mpi_ldadd) + +write_transfer_static_SOURCES = write-transfer.c testutil.c $(testutil_headers) +write_transfer_static_CPPFLAGS = $(ex_mpi_cppflags) +write_transfer_static_LDADD = $(ex_static_mpi_ldadd) +write_transfer_static_LDFLAGS = $(ex_static_ldflags) diff --git a/examples/src/app-btio.c b/examples/src/app-btio.c index a238f38da..230280d67 100644 --- a/examples/src/app-btio.c +++ b/examples/src/app-btio.c @@ -79,8 +79,7 @@ typedef struct { long offset; long length; char* buf; - -} read_req_t; +} rdreq_t; int main(int argc, char* argv[]) { @@ -175,7 +174,7 @@ int main(int argc, char* argv[]) /* calculate the number of I/O requests of writing these 3D cells */ long num_reqs = elems_per_tile * elems_per_tile * nr_tiles_per_proc; - read_req_t* r_w_reqs = (read_req_t*) malloc(num_reqs * sizeof(read_req_t)); + rdreq_t* r_w_reqs = (rdreq_t*) calloc(num_reqs, sizeof(rdreq_t)); /*initialize these I/O requests */ long cursor = 0, tot_sz; @@ -197,7 +196,7 @@ int main(int argc, char* argv[]) memset(buf, 0, elems_per_tile * SZ_PER_ELEM); MPI_Barrier(MPI_COMM_WORLD); - unifyfs_mount("/unifyfs", rank, ranknum, 0); + unifyfs_mount("/unifyfs", rank, ranknum); MPI_Barrier(MPI_COMM_WORLD); int fd = open(fname, O_RDWR | O_CREAT | O_TRUNC, 0600); diff --git a/examples/src/app-hdf5-create.c b/examples/src/app-hdf5-create.c 
index 86817b998..4ab912990 100644 --- a/examples/src/app-hdf5-create.c +++ b/examples/src/app-hdf5-create.c @@ -152,7 +152,7 @@ int main(int argc, char** argv) test_pause(rank, "Attempting to mount"); if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + ret = unifyfs_mount(mountpoint, rank, total_ranks); if (ret) { test_print(rank, "unifyfs_mount failed (return = %d)", ret); exit(-1); diff --git a/examples/src/app-hdf5-writeread.c b/examples/src/app-hdf5-writeread.c index f2c371bd8..782ef70e9 100644 --- a/examples/src/app-hdf5-writeread.c +++ b/examples/src/app-hdf5-writeread.c @@ -172,7 +172,7 @@ int main(int argc, char** argv) test_pause(rank, "Attempting to mount"); if (!standard) { - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + ret = unifyfs_mount(mountpoint, rank, total_ranks); if (ret) { test_print(rank, "unifyfs_mount failed (return = %d)", ret); exit(-1); diff --git a/examples/src/app-mpiio.c b/examples/src/app-mpiio.c index 8444134f9..51a0b4369 100644 --- a/examples/src/app-mpiio.c +++ b/examples/src/app-mpiio.c @@ -342,7 +342,7 @@ int main(int argc, char* argv[]) if (debug) { test_pause(rank, "Attempting to mount"); } - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); + ret = unifyfs_mount(mountpoint, rank, total_ranks); if (ret) { test_print(rank, "unifyfs_mount failed (return = %d)", ret); exit(-1); diff --git a/examples/src/app-tileio.c b/examples/src/app-tileio.c index 53640c140..1e52b8768 100644 --- a/examples/src/app-tileio.c +++ b/examples/src/app-tileio.c @@ -83,8 +83,7 @@ typedef struct { long offset; long length; char* buf; - -} read_req_t; +} rdreq_t; static const char* opts = "h:v:d:f:p:n:"; @@ -158,7 +157,7 @@ int main(int argc, char* argv[]) long num_reqs = y_size; /*number of I/O requests*/ - read_req_t* r_w_reqs = (read_req_t*) malloc(num_reqs * sizeof(read_req_t)); + rdreq_t* r_w_reqs = (rdreq_t*) calloc(num_reqs, sizeof(rdreq_t)); /*initialize the I/O requests*/ long cursor = 0, tot_sz; @@ -175,7 +174,7 @@ int main(int argc, char* argv[]) memset(buf, 0, x_size * sz_per_elem); MPI_Barrier(MPI_COMM_WORLD); - unifyfs_mount("/unifyfs", rank, ranknum, 0); + unifyfs_mount("/unifyfs", rank, ranknum); MPI_Barrier(MPI_COMM_WORLD); int fd; diff --git a/examples/src/read-data.c b/examples/src/read-data.c index af0e66f61..e2bad8f0d 100644 --- a/examples/src/read-data.c +++ b/examples/src/read-data.c @@ -311,7 +311,7 @@ int main(int argc, char** argv) if (strncmp(filename, mountpoint, strlen(mountpoint)) == 0) { printf("mounting unifyfs at %s ..\n", mountpoint); - ret = unifyfs_mount(mountpoint, 0, 1, 0); + ret = unifyfs_mount(mountpoint, 0, 1); if (ret) { fprintf(stderr, "unifyfs_mount failed (return = %d)\n", ret); return -1; diff --git a/examples/src/simul.c b/examples/src/simul.c index 3b1c2fa85..859f74383 100644 --- a/examples/src/simul.c +++ b/examples/src/simul.c @@ -1254,7 +1254,7 @@ int main(int argc, char **argv) { if (!testdir) { testdir = "/unifyfs"; } - ret = unifyfs_mount(testdir, rank, size, 0); + ret = unifyfs_mount(testdir, rank, size); if (ret && rank == 0) { printf("unifyfs_mount failed (ret=%d)\n", ret); MPI_Abort(MPI_COMM_WORLD, 2); diff --git a/examples/src/testutil.c b/examples/src/testutil.c index bb3b40256..18540c9cd 100644 --- a/examples/src/testutil.c +++ b/examples/src/testutil.c @@ -190,13 +190,9 @@ char* mktemp_cmd(test_cfg* cfg, char* tmpdir) return NULL; } - /* If we can get our job ID, use it to seed our RNG. 
Else use our app_id */ - job_id = getenv("LSB_JOBID"); - if (job_id) { - srand(atol(job_id)); - } else { - srand(cfg->app_id); - } + /* seed our RNG */ + unsigned int now = (unsigned int) time(NULL); + srand(now); do { /* diff --git a/examples/src/testutil.h b/examples/src/testutil.h index 3bc6ec595..ad1290108 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -39,6 +39,7 @@ #ifndef DISABLE_UNIFYFS # include +# include #endif /* ---------- Common Types and Definitions ---------- */ @@ -132,6 +133,7 @@ typedef struct { int pre_wr_trunc; /* truncate file before writing */ int post_wr_trunc; /* truncate file after writing */ int use_aio; /* use asynchronous IO */ + int use_api; /* use UnifyFS library API */ int use_lio; /* use lio_listio instead of read/write */ int use_mapio; /* use mmap instead of read/write */ int use_mpiio; /* use MPI-IO instead of POSIX I/O */ @@ -156,12 +158,20 @@ typedef struct { size_t mapped_sz; /* size of mapped extent */ MPI_File mpifh; /* MPI file handle (when use_mpiio) */ + /* transfer destination file */ + char* dest_filename; + int dest_fd; + MPI_File dest_mpifh; /* MPI file handle (when use_mpiio) */ + +#ifndef DISABLE_UNIFYFS + /* UnifyFS API info */ + unifyfs_handle fshdl; /* file system client handle */ + unifyfs_gfid gfid; /* global file id for target */ +#endif + /* MPI info */ int rank; int n_ranks; - - /* UnifyFS info */ - int app_id; } test_cfg; static inline @@ -176,6 +186,12 @@ void test_config_init(test_cfg* cfg) // set everything to 0/NULL memset(cfg, 0, sizeof(test_cfg)); +#ifndef DISABLE_UNIFYFS + // invalidate UnifyFS API state + cfg->fshdl = UNIFYFS_INVALID_HANDLE; + cfg->gfid = UNIFYFS_INVALID_GFID; +#endif + // N-to-1 UnifyFS by default cfg->use_mpi = 1; cfg->use_unifyfs = 1; @@ -223,6 +239,7 @@ void test_config_print(test_cfg* cfg) fprintf(fp, "\t pre_trunc = %d\n", cfg->pre_wr_trunc); fprintf(fp, "\t post_trunc = %d\n", cfg->post_wr_trunc); fprintf(fp, "\t use_aio = %d\n", cfg->use_aio); + fprintf(fp, "\t use_api = %d\n", cfg->use_api); fprintf(fp, "\t use_lio = %d\n", cfg->use_lio); fprintf(fp, "\t use_mapio = %d\n", cfg->use_mapio); fprintf(fp, "\t use_mpiio = %d\n", cfg->use_mpiio); @@ -238,10 +255,14 @@ void test_config_print(test_cfg* cfg) fprintf(fp, "\n-- Target File --\n"); fprintf(fp, "\t filename = %s\n", cfg->filename); - fprintf(fp, "\t mountpt = %s\n", cfg->mountpt); + fprintf(fp, "\t mountpoint = %s\n", cfg->mountpt); + + if (NULL != cfg->dest_filename) { + fprintf(fp, "\n-- Transfer Destination File --\n"); + fprintf(fp, "\t filename = %s\n", cfg->dest_filename); + } fprintf(fp, "\n-- MPI Info --\n"); - fprintf(fp, "\t app_id = %d\n", cfg->app_id); fprintf(fp, "\t rank = %d\n", cfg->rank); fprintf(fp, "\t n_ranks = %d\n", cfg->n_ranks); fprintf(fp, "\n==========================\n\n"); @@ -264,6 +285,12 @@ char* test_target_filename(test_cfg* cfg) return strdup(fname); } +static inline +char* test_destination_filename(test_cfg* cfg) +{ + return strdup(cfg->dest_filename); +} + /* ---------- Print Utilities ---------- */ static inline @@ -493,17 +520,18 @@ int test_is_static(const char* program) // common options for all tests -static const char* test_short_opts = "a:Ab:c:df:hkLm:MNn:o:p:PrSt:T:UvVx"; +static const char* test_short_opts = "Ab:c:dD:f:hklLm:Mn:No:p:PrSt:T:UvVx"; static const struct option test_long_opts[] = { - { "appid", 1, 0, 'a' }, { "aio", 0, 0, 'A' }, { "blocksize", 1, 0, 'b' }, { "chunksize", 1, 0, 'c' }, { "debug", 0, 0, 'd' }, + { "destfile", 1, 0, 'D' }, { "file", 1, 0, 'f' }, { 
"help", 0, 0, 'h' }, { "check", 0, 0, 'k' }, + { "library-api", 0, 0, 'l' }, { "listio", 0, 0, 'L' }, { "mount", 1, 0, 'm' }, { "mpiio", 0, 0, 'M' }, @@ -528,8 +556,6 @@ static const char* test_usage_str = "Usage: %s [options...]\n" "\n" "Available options:\n" - " -a, --appid= use given application id\n" - " (default: 0)\n" " -A, --aio use asynchronous I/O instead of read|write\n" " (default: off)\n" " -b, --blocksize= I/O block size\n" @@ -538,10 +564,14 @@ static const char* test_usage_str = " (default: 1 MiB)\n" " -d, --debug for debugging, wait for input (at rank 0) at start\n" " (default: off)\n" + " -D, --destfile= transfer destination file name (or path) outside mountpoint\n" + " (default: none)\n" " -f, --file= target file name (or path) under mountpoint\n" " (default: 'testfile')\n" " -k, --check check data contents upon read\n" " (default: off)\n" + " -l, --library-api use UnifyFS library API instead of POSIX I/O\n" + " (default: off)\n" " -L, --listio use lio_listio instead of read|write\n" " (default: off)\n" " -m, --mount= use for unifyfs\n" @@ -597,10 +627,6 @@ int test_process_argv(test_cfg* cfg, while ((ch = getopt_long(argc, argv, test_short_opts, test_long_opts, NULL)) != -1) { switch (ch) { - case 'a': - cfg->app_id = atoi(optarg); - break; - case 'A': cfg->use_aio = 1; break; @@ -617,6 +643,10 @@ int test_process_argv(test_cfg* cfg, cfg->debug = 1; break; + case 'D': + cfg->dest_filename = strdup(optarg); + break; + case 'f': cfg->filename = strdup(optarg); break; @@ -625,6 +655,10 @@ int test_process_argv(test_cfg* cfg, cfg->io_check = 1; break; + case 'l': + cfg->use_api = 1; + break; + case 'L': cfg->use_lio = 1; break; @@ -748,13 +782,28 @@ int test_process_argv(test_cfg* cfg, } } if (cfg->use_aio && - (cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr + (cfg->use_api || cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { test_print_once(cfg, "USAGE ERROR: --aio incompatible with " - "[--mapio, --mpiio, --prdwr, --stdio, --vecio]"); + "[--library-api, --mapio, --mpiio, --prdwr, --stdio, --vecio]"); exit(-1); } + if (cfg->use_api) { + if (cfg->use_lio || cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr + || cfg->use_stdio || cfg->use_vecio) { + test_print_once(cfg, + "USAGE ERROR: --library-api incompatible with " + "[--listio, --mapio, --mpiio, --prdwr, --stdio, --vecio]"); + exit(-1); + } + if (!cfg->use_unifyfs) { + test_print_once(cfg, + "USAGE ERROR: --library-api incompatible with " + "--disable-unifyfs"); + exit(-1); + } + } if (cfg->use_lio && (cfg->use_mapio || cfg->use_mpiio || cfg->use_prdwr || cfg->use_stdio || cfg->use_vecio)) { @@ -1031,7 +1080,17 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) assert(NULL != cfg); - if (cfg->use_mpiio) { + if (cfg->use_api) { +#ifndef DISABLE_UNIFYFS + unifyfs_rc rc = unifyfs_open(cfg->fshdl, access, filepath, + &(cfg->gfid)); + if (UNIFYFS_SUCCESS != rc) { + test_print(cfg, "ERROR: unifyfs_open(%s) failed - %s", + filepath, unifyfs_rc_enum_description(rc)); + return -1; + } +#endif + } else if (cfg->use_mpiio) { int amode = test_access_to_mpiio_mode(access); if (cfg->io_pattern == IO_PATTERN_N1) { MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_WORLD, filepath, amode, @@ -1043,6 +1102,7 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) if (mpi_error) { return -1; } + cfg->fd = 1; // dummy value to denote target } else if (cfg->use_stdio) { fmode = test_access_to_stdio_mode(access); fp = fopen(filepath, fmode); @@ -1063,6 +1123,44 @@ int 
test_open_file(test_cfg* cfg, const char* filepath, int access) return 0; } +/* + * open the given file + */ +static inline +int test_open_destination_file(test_cfg* cfg, int access) +{ + FILE* fp = NULL; + const char* fmode; + int fd = -1; + + assert(NULL != cfg); + + if (cfg->use_mpiio) { + int amode = test_access_to_mpiio_mode(access); + if (cfg->io_pattern == IO_PATTERN_N1) { + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_WORLD, cfg->dest_filename, + amode, MPI_INFO_NULL, + &cfg->dest_mpifh))); + } else { + MPI_CHECK(cfg, (MPI_File_open(MPI_COMM_SELF, cfg->dest_filename, + amode, MPI_INFO_NULL, + &cfg->dest_mpifh))); + } + if (mpi_error) { + return -1; + } + cfg->dest_fd = 2; // dummy value to denote destination + } else { + fd = open(cfg->dest_filename, access); + if (-1 == fd) { + test_print(cfg, "ERROR: open(%s) failed", cfg->dest_filename); + return -1; + } + cfg->dest_fd = fd; + } + return 0; +} + /* * close the given file */ @@ -1071,12 +1169,21 @@ int test_close_file(test_cfg* cfg) { assert(NULL != cfg); + if (cfg->use_api) { +#ifndef DISABLE_UNIFYFS + cfg->gfid = UNIFYFS_INVALID_GFID; +#endif + return 0; + } + if (cfg->use_mpiio) { MPI_CHECK(cfg, (MPI_File_close(&cfg->mpifh))); + cfg->fd = -1; } if (NULL != cfg->fp) { fclose(cfg->fp); + cfg->fp = NULL; } if (NULL != cfg->mapped) { @@ -1085,7 +1192,30 @@ int test_close_file(test_cfg* cfg) if (-1 != cfg->fd) { close(cfg->fd); + cfg->fd = -1; } + + return 0; +} + +/* + * close the given file + */ +static inline +int test_close_destination_file(test_cfg* cfg) +{ + assert(NULL != cfg); + + if (cfg->use_mpiio) { + MPI_CHECK(cfg, (MPI_File_close(&cfg->dest_mpifh))); + } + + if (-1 != cfg->dest_fd) { + close(cfg->dest_fd); + } + + cfg->dest_fd = -1; + return 0; } @@ -1116,11 +1246,19 @@ int test_remove_file(test_cfg* cfg, const char* filepath) return 0; } - /* POSIX I/O - * N-to-1 - rank 0 deletes shared files - * N-to-N - all ranks delete per-process files */ + /* N-to-1 - rank 0 deletes shared files + * N-to-N - all ranks delete per-process files */ if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { - if (cfg->use_stdio) { + if (cfg->use_api) { +#ifndef DISABLE_UNIFYFS + unifyfs_rc urc = unifyfs_remove(cfg->fshdl, filepath); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_remove(%s) failed - %s", + filepath, unifyfs_rc_enum_description(urc)); + return -1; + } +#endif + } else if (cfg->use_stdio) { rc = remove(filepath); if (rc) { test_print(cfg, "ERROR: remove(%s) failed", filepath); @@ -1151,6 +1289,7 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) assert(NULL != cfg); + /* MPI-IO */ if (cfg->use_mpiio) { create_mode = test_access_to_mpiio_mode(access); create_mode |= MPI_MODE_CREATE; @@ -1168,11 +1307,20 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) return 0; } - /* POSIX I/O - * N-to-1 - rank 0 creates shared files - * N-to-N - all ranks create per-process files */ + /* N-to-1 - rank 0 creates shared files + * N-to-N - all ranks create per-process files */ if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { - if (cfg->use_stdio) { + if (cfg->use_api) { +#ifndef DISABLE_UNIFYFS + unifyfs_rc urc = unifyfs_create(cfg->fshdl, 0, filepath, + &(cfg->gfid)); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_create(%s) failed - %s", + filepath, unifyfs_rc_enum_description(urc)); + return -1; + } +#endif + } else if (cfg->use_stdio) { fmode = test_access_to_stdio_mode(access); fp = fopen(filepath, fmode); if (NULL == fp) { @@ -1321,16 +1469,28 @@ 
int test_init(int argc, char** argv, test_config_print(cfg); } - if (cfg->use_unifyfs && !cfg->enable_mpi_mount) { + if (cfg->use_unifyfs) { #ifndef DISABLE_UNIFYFS if (cfg->debug) { - test_pause(cfg, "Before unifyfs_mount()"); + test_pause(cfg, "Before mounting UnifyFS"); } - rc = unifyfs_mount(cfg->mountpt, cfg->rank, cfg->n_ranks, cfg->app_id); - if (rc) { - test_print(cfg, "ERROR: unifyfs_mount() failed (rc=%d)", rc); - test_abort(cfg, rc); - return -1; + if (cfg->use_api) { + unifyfs_rc urc = unifyfs_initialize(cfg->mountpt, NULL, 0, + &(cfg->fshdl)); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_initialize(%s) failed (%s)", + cfg->mountpt, unifyfs_rc_enum_description(urc)); + test_abort(cfg, (int)urc); + return -1; + } + } else if (!cfg->enable_mpi_mount) { + rc = unifyfs_mount(cfg->mountpt, cfg->rank, cfg->n_ranks); + if (rc) { + test_print(cfg, "ERROR: unifyfs_mount(%s) failed (rc=%d)", + cfg->mountpt, rc); + test_abort(cfg, rc); + return -1; + } } #endif test_barrier(cfg); @@ -1356,11 +1516,20 @@ void test_fini(test_cfg* cfg) test_close_file(cfg); - if (cfg->use_unifyfs && !cfg->enable_mpi_mount) { + if (cfg->use_unifyfs) { #ifndef DISABLE_UNIFYFS - int rc = unifyfs_unmount(); - if (rc) { - test_print(cfg, "ERROR: unifyfs_unmount() failed (rc=%d)", rc); + if (cfg->use_api) { + unifyfs_rc urc = unifyfs_finalize(cfg->fshdl); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_finalize() failed - %s", + unifyfs_rc_enum_description(urc)); + } + cfg->fshdl = UNIFYFS_INVALID_HANDLE; + } else if (!cfg->enable_mpi_mount) { + int rc = unifyfs_unmount(); + if (rc) { + test_print(cfg, "ERROR: unifyfs_unmount() failed (rc=%d)", rc); + } } #endif } diff --git a/examples/src/testutil_rdwr.h b/examples/src/testutil_rdwr.h index a372608b8..b439b57c3 100644 --- a/examples/src/testutil_rdwr.h +++ b/examples/src/testutil_rdwr.h @@ -17,6 +17,18 @@ #include "testutil.h" +#ifndef DISABLE_UNIFYFS +static struct { + size_t n_reqs; + struct aiocb* reqs; + unifyfs_io_request* writes; +} unify_writes; +static struct { + size_t n_reqs; + struct aiocb* reqs; + unifyfs_io_request* reads; +} unify_reads; +#endif static inline void test_print_aiocb(test_cfg* cfg, struct aiocb* cbp) @@ -28,6 +40,9 @@ void test_print_aiocb(test_cfg* cfg, struct aiocb* cbp) /* -------- Write Helper Methods -------- */ +static inline +int issue_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs); + static inline int issue_write_req(test_cfg* cfg, struct aiocb* req) { @@ -48,6 +63,8 @@ int issue_write_req(test_cfg* cfg, struct aiocb* req) test_print(cfg, "aio_write() failed"); } return rc; + } else if (cfg->use_api || cfg->use_lio) { + return issue_write_req_batch(cfg, 1, req); } else if (cfg->use_mapio) { // mmap(2) return ENOTSUP; } else if (cfg->use_mpiio) { // MPI-IO @@ -112,7 +129,38 @@ int issue_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) assert(NULL != cfg); - if (cfg->use_lio) { // lio_listio(2) + + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + if (unify_writes.writes != NULL) { + /* release allocation from previous call */ + free(unify_writes.writes); + } + unify_writes.n_reqs = n_reqs; + unify_writes.reqs = reqs; + unify_writes.writes = calloc(n_reqs, sizeof(unifyfs_io_request)); + for (size_t i = 0; i < n_reqs; i++) { + struct aiocb* req = reqs + i; + unifyfs_io_request* wr = unify_writes.writes + i; + wr->op = UNIFYFS_IOREQ_OP_WRITE; + wr->gfid = cfg->gfid; + wr->nbytes = req->aio_nbytes; + wr->offset = req->aio_offset; + 
wr->user_buf = (void*) req->aio_buf; + } + + unifyfs_rc urc = unifyfs_dispatch_io(cfg->fshdl, n_reqs, + unify_writes.writes); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "unifyfs_dispatch_io(%s, OP_WRITE) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } else if (cfg->use_lio) { // lio_listio(2) struct aiocb* lio_vec[n_reqs]; for (i = 0; i < n_reqs; i++) { lio_vec[i] = reqs + i; @@ -145,6 +193,9 @@ int issue_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) } } +static inline +int wait_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs); + static inline int wait_write_req(test_cfg* cfg, struct aiocb* req) { @@ -175,6 +226,8 @@ int wait_write_req(test_cfg* cfg, struct aiocb* req) ss, req->aio_nbytes); return -1; } + } else if (cfg->use_api) { + return wait_write_req_batch(cfg, 1, req); } return 0; } @@ -187,6 +240,26 @@ int wait_write_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) assert(NULL != cfg); + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + if ((unify_writes.n_reqs != n_reqs) || (unify_writes.reqs != reqs)) { + /* wait args do not match previous write batch */ + test_print(cfg, "mismatched wait on unify_writes"); + return EINVAL; + } + unifyfs_rc urc = unifyfs_wait_io(cfg->fshdl, n_reqs, + unify_writes.writes, 1); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "unifyfs_wait_io(%s, WAITALL) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } + for (i = 0; i < n_reqs; i++) { rc = wait_write_req(cfg, reqs + i); if (rc) { @@ -202,7 +275,30 @@ int write_truncate(test_cfg* cfg) { int rc = 0; - if (cfg->use_mpiio) { + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + unifyfs_io_request req = {0}; + req.op = UNIFYFS_IOREQ_OP_TRUNC; + req.gfid = cfg->gfid; + req.offset = cfg->trunc_size; + + unifyfs_rc urc = unifyfs_dispatch_io(cfg->fshdl, 1, &req); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "unifyfs_dispatch_io(%s, OP_TRUNC) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + urc = unifyfs_wait_io(cfg->fshdl, 1, &req, 1); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "unifyfs_wait_io(%s, WAITALL) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } else if (cfg->use_mpiio) { MPI_Offset mpi_off = (MPI_Offset) cfg->trunc_size; MPI_CHECK(cfg, (MPI_File_set_size(cfg->mpifh, mpi_off))); } else { @@ -227,7 +323,20 @@ int write_sync(test_cfg* cfg) assert(NULL != cfg); - if (NULL != cfg->fp) { // fflush(3) + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + unifyfs_rc urc = unifyfs_sync(cfg->fshdl, cfg->gfid); + if (UNIFYFS_SUCCESS != rc) { + test_print(cfg, "unifyfs_sync(%s, gfid=%d) failed - %s", + cfg->filename, cfg->gfid, + unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } else if (NULL != cfg->fp) { // fflush(3) rc = fflush(cfg->fp); if (-1 == rc) { test_print(cfg, "fflush() failed"); @@ -258,12 +367,25 @@ int write_laminate(test_cfg* cfg, const char* filepath) * we use the same process that created the file */ int rc = 0; if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { - /* laminate by setting permissions to read-only */ - int chmod_rc = chmod(filepath, 0444); - if (-1 == chmod_rc) { - /* lamination failed */ - test_print(cfg, "chmod() during lamination failed"); - rc = -1; + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return 
ENOTSUP; +#else + unifyfs_rc urc = unifyfs_laminate(cfg->fshdl, filepath); + if (UNIFYFS_SUCCESS != rc) { + test_print(cfg, "unifyfs_laminate(%s) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + rc = -1; + } +#endif + } else { + /* laminate by setting permissions to read-only */ + int chmod_rc = chmod(filepath, 0444); + if (-1 == chmod_rc) { + /* lamination failed */ + test_print(cfg, "chmod() during lamination failed"); + rc = -1; + } } } if (cfg->io_pattern == IO_PATTERN_N1) { @@ -277,11 +399,26 @@ int stat_file(test_cfg* cfg, const char* filepath) { int rc = 0; if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { - struct stat s; - int stat_rc = stat(filepath, &s); - if (-1 == stat_rc) { - test_print(cfg, "ERROR - stat(%s) failed", filepath); - rc = -1; + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + unifyfs_status us; + unifyfs_rc urc = unifyfs_stat(cfg->fshdl, cfg->gfid, &us); + if (UNIFYFS_SUCCESS != rc) { + test_print(cfg, "unifyfs_stat(%s, gfid=%d) failed - %s", + cfg->filename, cfg->gfid, + unifyfs_rc_enum_description(urc)); + rc = -1; + } +#endif + } else { + struct stat s; + int stat_rc = stat(filepath, &s); + if (-1 == stat_rc) { + test_print(cfg, "ERROR - stat(%s) failed", filepath); + rc = -1; + } } } return rc; @@ -289,6 +426,9 @@ int stat_file(test_cfg* cfg, const char* filepath) /* -------- Read Helper Methods -------- */ +static inline +int issue_read_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs); + static inline int issue_read_req(test_cfg* cfg, struct aiocb* req) { @@ -307,6 +447,8 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) test_print(cfg, "aio_read() failed"); } return rc; + } else if (cfg->use_api || cfg->use_lio) { + return issue_read_req_batch(cfg, 1, req); } else if (cfg->use_mapio) { // mmap(2) return ENOTSUP; } else if (cfg->use_mpiio) { // MPI-IO @@ -314,8 +456,13 @@ int issue_read_req(test_cfg* cfg, struct aiocb* req) MPI_Offset off = (MPI_Offset) req->aio_offset; void* dst_buf = (void*) req->aio_buf; int count = (int) req->aio_nbytes; - MPI_CHECK(cfg, (MPI_File_read_at(cfg->mpifh, off, dst_buf, - count, MPI_CHAR, &mst))); + if (req->aio_fildes == cfg->fd) { + MPI_CHECK(cfg, (MPI_File_read_at(cfg->mpifh, off, dst_buf, + count, MPI_CHAR, &mst))); + } else if (req->aio_fildes == cfg->dest_fd) { + MPI_CHECK(cfg, (MPI_File_read_at(cfg->dest_mpifh, off, dst_buf, + count, MPI_CHAR, &mst))); + } } else if (cfg->use_prdwr) { // pread(2) nread = 0; remaining = req->aio_nbytes; @@ -382,7 +529,37 @@ int issue_read_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) assert(NULL != cfg); - if (cfg->use_lio) { // lio_listio(2) + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + if (unify_reads.reads != NULL) { + /* release allocation from previous call */ + free(unify_reads.reads); + } + unify_reads.n_reqs = n_reqs; + unify_reads.reqs = reqs; + unify_reads.reads = calloc(n_reqs, sizeof(unifyfs_io_request)); + for (size_t i = 0; i < n_reqs; i++) { + struct aiocb* req = reqs + i; + unifyfs_io_request* rd = unify_reads.reads + i; + rd->op = UNIFYFS_IOREQ_OP_READ; + rd->gfid = cfg->gfid; + rd->nbytes = req->aio_nbytes; + rd->offset = req->aio_offset; + rd->user_buf = (void*) req->aio_buf; + } + + unifyfs_rc urc = unifyfs_dispatch_io(cfg->fshdl, n_reqs, + unify_reads.reads); + if (UNIFYFS_SUCCESS != rc) { + test_print(cfg, "unifyfs_dispatch_io(%s, OP_READ) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } else if 
(cfg->use_lio) { // lio_listio(2) struct aiocb* lio_vec[n_reqs]; for (i = 0; i < n_reqs; i++) { lio_vec[i] = reqs + i; @@ -414,6 +591,9 @@ int issue_read_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) } } +static inline +int wait_read_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs); + static inline int wait_read_req(test_cfg* cfg, struct aiocb* req) { @@ -444,6 +624,8 @@ int wait_read_req(test_cfg* cfg, struct aiocb* req) ss, req->aio_nbytes); return -1; } + } else if (cfg->use_api) { + return wait_read_req_batch(cfg, 1, req); } return 0; } @@ -456,6 +638,26 @@ int wait_read_req_batch(test_cfg* cfg, size_t n_reqs, struct aiocb* reqs) assert(NULL != cfg); + if (cfg->use_api) { +#ifdef DISABLE_UNIFYFS + return ENOTSUP; +#else + if ((unify_reads.n_reqs != n_reqs) || (unify_reads.reqs != reqs)) { + /* wait args do not match previous read batch */ + test_print(cfg, "mismatched wait on unify_reads"); + return EINVAL; + } + unifyfs_rc urc = unifyfs_wait_io(cfg->fshdl, n_reqs, + unify_reads.reads, 1); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "unifyfs_wait_io(%s, WAITALL) failed - %s", + cfg->filename, unifyfs_rc_enum_description(urc)); + return -1; + } + return 0; +#endif + } + for (i = 0; i < n_reqs; i++) { rc = wait_read_req(cfg, reqs + i); if (rc) { diff --git a/examples/src/transfer.c b/examples/src/transfer.c index e31c297f0..8697dbba3 100644 --- a/examples/src/transfer.c +++ b/examples/src/transfer.c @@ -34,19 +34,16 @@ #include "testlib.h" + +static unsigned long bufsize = 64 * (1 << 10); + +/* options */ static int rank; static int total_ranks; static int rank_worker; static int parallel; static int debug; - -static char* mountpoint = "/unifyfs"; /* unifyfs mountpoint */ -static int unmount; /* unmount unifyfs after running the test */ - -static char* srcpath; -static char* dstpath; - -static unsigned long bufsize = 64 * (1 << 10); +static char* mountpoint; /* unifyfs mountpoint */ static struct option long_opts[] = { { "debug", 0, 0, 'd' }, @@ -54,11 +51,10 @@ static struct option long_opts[] = { { "mount", 1, 0, 'm' }, { "parallel", 0, 0, 'p' }, { "rank", 1, 0, 'r' }, - { "unmount", 0, 0, 'u' }, { 0, 0, 0, 0}, }; -static char* short_opts = "dhm:pr:u"; +static char* short_opts = "dhm:pr:"; static const char* usage_str = "\n" @@ -72,7 +68,6 @@ static const char* usage_str = " (default: /unifyfs)\n" " -p, --parallel parallel transfer\n" " -r, --rank= use for transfer (default: 0)\n" - " -u, --unmount unmount the filesystem after test\n" "\n"; static char* program; @@ -85,11 +80,16 @@ static void print_usage(void) int main(int argc, char** argv) { + int mounted = 0; int ret = 0; int ch = 0; int optidx = 0; struct stat sb = { 0, }; + size_t srclen; + char* srcpath; + char* dstpath; + program = basename(strdup(argv[0])); MPI_Init(&argc, &argv); @@ -117,10 +117,10 @@ int main(int argc, char** argv) case 'r': rank_worker = atoi(optarg); - break; - - case 'u': - unmount = 1; + if (rank_worker >= total_ranks) { + test_print(rank, "ERROR - %d is not a valid rank"); + print_usage(); + } break; case 'h': @@ -130,60 +130,61 @@ int main(int argc, char** argv) } } + /* get source and destination files */ if (argc - optind != 2) { print_usage(); } - srcpath = strdup(argv[optind++]); dstpath = strdup(argv[optind++]); - if (srcpath[strlen(srcpath) - 1] == '/') { - srcpath[strlen(srcpath) - 1] = '\0'; + srclen = strlen(srcpath); + if (srcpath[srclen - 1] == '/') { + srcpath[srclen - 1] = '\0'; } if (debug) { - test_pause(rank, "Attempting to mount"); + test_pause(rank, 
"Before mounting UnifyFS"); } - ret = unifyfs_mount(mountpoint, rank, total_ranks, 0); - if (ret) { - test_print(rank, "unifyfs_mount failed (return = %d)", ret); - goto out; + if (NULL == mountpoint) { + mountpoint = strdup("/unifyfs"); } - if (parallel) { - ret = unifyfs_transfer_file_parallel(srcpath, dstpath); - if (ret) { - test_print(rank, "copy failed (%d: %s)", ret, strerror(ret)); - } + ret = unifyfs_mount(mountpoint, rank, total_ranks); + if (ret) { + test_print(rank, "unifyfs_mount(%s) failed (rc=%d)", + mountpoint, ret); } else { - if (rank_worker >= total_ranks) { - test_print(rank, "%d is not a valid rank"); - goto out; - } + mounted = 1; + } - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - if (rank == rank_worker) { + if (mounted) { + if (parallel) { + ret = unifyfs_transfer_file_parallel(srcpath, dstpath); + if (ret) { + test_print(rank, "paralled transfer failed (rc=%d: %s)", + ret, strerror(ret)); + } + } else if (rank == rank_worker) { ret = unifyfs_transfer_file_serial(srcpath, dstpath); if (ret) { - test_print(rank, "copy failed (%d: %s)", ret, strerror(ret)); + test_print(rank, "serial transfer failed (rc=%d: %s)", + ret, strerror(ret)); } } + unifyfs_unmount(); } - free(dstpath); - free(srcpath); - MPI_Barrier(MPI_COMM_WORLD); - if (unmount) { - unifyfs_unmount(); - } - -out: MPI_Finalize(); + free(mountpoint); + free(srcpath); + free(dstpath); + return ret; } diff --git a/examples/src/write-transfer.c b/examples/src/write-transfer.c new file mode 100644 index 000000000..9e8122cc4 --- /dev/null +++ b/examples/src/write-transfer.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
+ */ + +#include "testutil.h" +#include "testutil_rdwr.h" + +// generate N-to-1 or N-to-N writes according to test config +size_t generate_write_reqs(test_cfg* cfg, char* srcbuf, + struct aiocb** reqs_out) +{ + off_t blk_off, chk_off; + size_t i, j, ndx = 0; + size_t blk_sz = cfg->block_sz; + size_t tran_sz = cfg->chunk_sz; + size_t n_tran_per_blk = blk_sz / tran_sz; + int rankbyte = (int)'0' + cfg->rank; + + size_t num_reqs = cfg->n_blocks * n_tran_per_blk; + struct aiocb* req; + struct aiocb* reqs = calloc(num_reqs, sizeof(struct aiocb)); + if (NULL == reqs) { + *reqs_out = NULL; + return 0; + } + + req = reqs; + for (i = 0; i < cfg->n_blocks; i++) { + // interleaved block writes + blk_off = (i * blk_sz * cfg->n_ranks) + (cfg->rank * blk_sz); + + if (cfg->io_check) { + // generate lipsum in source block + lipsum_generate((srcbuf + ndx), blk_sz, blk_off); + } else { + // fill srcbuf with unique rank character + memset((srcbuf + ndx), rankbyte, blk_sz); + } + + for (j = 0; j < n_tran_per_blk; j++) { + chk_off = blk_off + (j * tran_sz); + + req->aio_fildes = cfg->fd; + req->aio_buf = (void*)(srcbuf + ndx); + req->aio_nbytes = tran_sz; + req->aio_offset = chk_off; + req->aio_lio_opcode = LIO_WRITE; + + req++; + ndx += tran_sz; + } + } + + *reqs_out = reqs; + return num_reqs; +} + +// transfer test target file to the destination file using UnifyFS library API +int transfer_target_to_destination(test_cfg* cfg, + char* target_file, + char* dest_file) +{ + int ret = 0; + // use a single rank to initiate and wait for completion of transfer + if (cfg->rank == 0) { + unifyfs_transfer_request transfer = {0}; + transfer.src_path = target_file; + transfer.dst_path = dest_file; + transfer.mode = UNIFYFS_TRANSFER_MODE_MOVE; + transfer.use_parallel = 1; + unifyfs_rc urc = unifyfs_dispatch_transfer(cfg->fshdl, 1, &transfer); + if (urc != UNIFYFS_SUCCESS) { + fprintf(stderr, "ERROR - Test transfer dispatch failed! %s", + unifyfs_rc_enum_description(urc)); + ret = urc; + } else { + urc = unifyfs_wait_transfer(cfg->fshdl, 1, &transfer, 1); + if (urc != UNIFYFS_SUCCESS) { + fprintf(stderr, "ERROR - Test transfer wait failed! %s", + unifyfs_rc_enum_description(urc)); + ret = urc; + } + } + } + return ret; +} + +// generate N-to-1 or N-to-N reads according to test config +size_t generate_read_reqs(test_cfg* cfg, char* dstbuf, + struct aiocb** reqs_out) +{ + int read_rank = cfg->rank; + off_t blk_off, chk_off; + size_t i, j, ndx = 0; + size_t blk_sz = cfg->block_sz; + size_t tran_sz = cfg->chunk_sz; + size_t n_tran_per_blk = blk_sz / tran_sz; + + size_t num_reqs = cfg->n_blocks * n_tran_per_blk; + struct aiocb* req; + struct aiocb* reqs = calloc(num_reqs, sizeof(struct aiocb)); + if (NULL == reqs) { + *reqs_out = NULL; + return 0; + } + + if (cfg->io_shuffle) { + // 0 reads data written by N-1, N-1 reads from 0, etc. 
+ read_rank = (cfg->n_ranks - 1) - cfg->rank; + } + + req = reqs; + for (i = 0; i < cfg->n_blocks; i++) { + // interleaved block writes + blk_off = (i * blk_sz * cfg->n_ranks) + + (read_rank * blk_sz); + + for (j = 0; j < n_tran_per_blk; j++) { + chk_off = blk_off + (j * tran_sz); + + req->aio_fildes = cfg->dest_fd; + req->aio_buf = (void*)(dstbuf + ndx); + req->aio_nbytes = tran_sz; + req->aio_offset = chk_off; + req->aio_lio_opcode = LIO_READ; + + req++; + ndx += tran_sz; + } + } + + *reqs_out = reqs; + return num_reqs; +} + + +/* -------- Main Program -------- */ + +/* Description: + * + * [ Mode 1: N-to-1 shared-file ] + * Each rank writes cfg.n_blocks blocks of size cfg.block_sz to the + * shared file, using I/O operation sizes of cfg.chunk_sz. Blocks are + * rank-interleaved (i.e., the block at offset 0 is written by rank 0, + * the block at offset cfg.block_sz is written by rank 1, and so on). + * + * After writing all blocks, the written data is synced (laminated) and + * the shared file is transferred using the library API to the + * destination path. + * + * Each rank then reads back the written blocks from the destination + * file using I/O operation sizes of cfg.chunk_sz. If cfg.io_shuffle is + * enabled, each rank will read different blocks than it wrote. + * + * [ Mode 2: N-to-N file-per-process ] + * This mode is not supported. + * + * [ Options for Both Modes ] + * + * cfg.use_lio - when enabled, lio_listio(3) will be used for batching + * reads and writes. + * + * cfg.use_mpiio - when enabled, MPI-IO will be used. + * + * cfg.use_prdwr - when enabled, pread(2) and pwrite(2) will be used. + * + * cfg.io_check - when enabled, lipsum data is used when writing + * the file and verified when reading. + * + */ + +int main(int argc, char* argv[]) +{ + char* wr_buf; + char* rd_buf; + char* target_file; + char* destination_file; + struct aiocb* reqs = NULL; + size_t num_reqs = 0; + int rc; + + test_cfg test_config; + test_cfg* cfg = &test_config; + + test_timer time_write; + test_timer time_sync; + test_timer time_transfer; + test_timer time_read; + + timer_init(&time_write, "write"); + timer_init(&time_sync, "sync"); + timer_init(&time_transfer, "transfer"); + timer_init(&time_read, "read"); + + rc = test_init(argc, argv, cfg); + if (rc) { + fprintf(stderr, "ERROR - Test %s initialization failed!", + argv[0]); + fflush(NULL); + return rc; + } + + if (cfg->io_pattern != IO_PATTERN_N1) { + fprintf(stderr, "ERROR - Test %s requires shared file pattern!", + argv[0]); + fflush(NULL); + return -1; + } + + if (!cfg->use_unifyfs) { + fprintf(stderr, "ERROR - Test %s requires UnifyFS!", + argv[0]); + fflush(NULL); + return -1; + } + + if (!test_config.use_mpi) { + fprintf(stderr, "ERROR - Test %s requires MPI!", + argv[0]); + fflush(NULL); + return -1; + } + + target_file = test_target_filename(cfg); + destination_file = test_destination_filename(cfg); + + if (NULL == destination_file) { + fprintf(stderr, "ERROR - Test %s requires destination file!", + argv[0]); + fflush(NULL); + return -1; + } + + // if reusing filename, remove old target file before starting timers + if (cfg->reuse_filename) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s for reuse", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + + // create file + test_print_verbose_once(cfg, + "DEBUG: creating target file %s", target_file); + rc = test_create_file(cfg, target_file, O_RDWR); + if (rc) { + test_abort(cfg, 
rc); + } + + // generate write requests + test_print_verbose_once(cfg, "DEBUG: generating write requests"); + wr_buf = calloc(test_config.n_blocks, test_config.block_sz); + if (NULL == wr_buf) { + test_abort(cfg, ENOMEM); + } + num_reqs = generate_write_reqs(cfg, wr_buf, &reqs); + if (0 == num_reqs) { + test_abort(cfg, ENOMEM); + } + + // do writes + test_print_verbose_once(cfg, "DEBUG: starting write requests"); + timer_start_barrier(cfg, &time_write); + rc = issue_write_req_batch(cfg, num_reqs, reqs); + if (rc) { + test_abort(cfg, rc); + } + rc = wait_write_req_batch(cfg, num_reqs, reqs); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_write); + test_print_verbose_once(cfg, + "DEBUG: finished write requests (elapsed=%.6lf sec)", + time_write.elapsed_sec_all); + + // sync + timer_start_barrier(cfg, &time_sync); + rc = write_sync(cfg); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_sync); + test_print_verbose_once(cfg, + "DEBUG: finished sync (elapsed=%.6lf sec)", + time_sync.elapsed_sec_all); + + // laminate + test_print_verbose_once(cfg, "DEBUG: laminating target file"); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } + + // stat file post-laminate + test_print_verbose_once(cfg, "DEBUG: calling stat() on target file"); + stat_cmd(cfg, target_file); + + // post-write cleanup + free(wr_buf); + free(reqs); + reqs = NULL; + + // need to initialize API handle for transfer + unifyfs_rc urc; + bool api_init = false; + if (cfg->fshdl == UNIFYFS_INVALID_HANDLE) { + urc = unifyfs_initialize(cfg->mountpt, NULL, 0, &(cfg->fshdl)); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_initialize(%s) failed (%s)", + cfg->mountpt, unifyfs_rc_enum_description(urc)); + test_abort(cfg, (int)urc); + return -1; + } + api_init = true; + } + + // use single rank to initiate transfer to destination file + test_print_verbose_once(cfg, + "DEBUG: transferring target file to destination %s", + destination_file); + timer_start_barrier(cfg, &time_transfer); + rc = transfer_target_to_destination(cfg, target_file, destination_file); + timer_stop_barrier(cfg, &time_transfer); + test_print_verbose_once(cfg, + "DEBUG: finished transfer (elapsed=%.6lf sec)", + time_transfer.elapsed_sec_all); + + // we're done with API handle now + if (api_init) { + urc = unifyfs_finalize(cfg->fshdl); + if (UNIFYFS_SUCCESS != urc) { + test_print(cfg, "ERROR: unifyfs_finalize() failed - %s", + unifyfs_rc_enum_description(urc)); + } + cfg->fshdl = UNIFYFS_INVALID_HANDLE; + } + + // open the destination file + test_print_verbose_once(cfg, "DEBUG: opening destination file"); + rc = test_open_destination_file(cfg, O_RDONLY); + if (rc) { + test_abort(cfg, rc); + } + + // generate read requests + test_print_verbose_once(cfg, "DEBUG: generating read requests"); + rd_buf = calloc(test_config.n_blocks, test_config.block_sz); + if (NULL == rd_buf) { + test_abort(cfg, ENOMEM); + } + num_reqs = generate_read_reqs(cfg, rd_buf, &reqs); + if (0 == num_reqs) { + test_abort(cfg, ENOMEM); + } + + // do reads + test_print_verbose_once(cfg, "DEBUG: starting read requests"); + timer_start_barrier(cfg, &time_read); + rc = issue_read_req_batch(cfg, num_reqs, reqs); + if (rc) { + test_abort(cfg, rc); + } + rc = wait_read_req_batch(cfg, num_reqs, reqs); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_read); + test_print_verbose_once(cfg, + "DEBUG: finished read requests (elapsed=%.6lf sec)", + time_read.elapsed_sec_all); + + if (test_config.io_check) { 
+ test_print_verbose_once(cfg, "DEBUG: verifying data"); + rc = check_read_req_batch(cfg, num_reqs, reqs); + } + + // post-read cleanup + free(rd_buf); + free(reqs); + reqs = NULL; + + // calculate achieved bandwidth rates + size_t rank_bytes = test_config.n_blocks * test_config.block_sz; + size_t total_bytes = rank_bytes * test_config.n_ranks; + + double max_global_write_time = test_reduce_double_max(cfg, + time_write.elapsed_sec_all); + double max_global_sync_time = test_reduce_double_max(cfg, + time_sync.elapsed_sec_all); + double global_write_bw = bandwidth_mib(total_bytes, max_global_write_time); + double global_write_sync_bw = bandwidth_mib(total_bytes, + max_global_write_time + max_global_sync_time); + + double global_transfer_bw = bandwidth_mib(total_bytes, + time_transfer.elapsed_sec_all); + + double max_global_read_time = test_reduce_double_max(cfg, + time_read.elapsed_sec_all); + double global_read_bw = bandwidth_mib(total_bytes, max_global_read_time); + + if (test_config.rank == 0) { + errno = 0; /* just in case there was an earlier error */ + test_print_once(cfg, "Target Write Time is %.6lf s", + max_global_write_time); + test_print_once(cfg, "Target Write BW is %.3lf MiB/s", + global_write_bw); + test_print_once(cfg, "Target Sync Time is %.6lf s", + max_global_sync_time); + test_print_once(cfg, "Target Write+Sync BW is %.3lf MiB/s", + global_write_sync_bw); + test_print_once(cfg, "Transfer Time is %.6lf s", + time_transfer.elapsed_sec_all); + test_print_once(cfg, "Transfer BW is %.3lf MiB/s", + global_transfer_bw); + test_print_once(cfg, "Destination Read BW is %.3lf MiB/s", + global_read_bw); + } + + // cleanup + free(target_file); + free(destination_file); + + timer_fini(&time_write); + timer_fini(&time_sync); + timer_fini(&time_transfer); + timer_fini(&time_read); + + test_fini(cfg); + + return 0; +} diff --git a/examples/src/writeread.c b/examples/src/writeread.c index 5b8f7b2ec..be8e4efc3 100644 --- a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -148,6 +148,9 @@ size_t generate_read_reqs(test_cfg* cfg, char* dstbuf, * cfg.use_aio - when enabled, aio(7) will be used for issuing and * completion of reads and writes. * + * cfg.use_api - when enabled, the UnifyFS library API will be used + * for issuing and completion of reads and writes. + * * cfg.use_lio - when enabled, lio_listio(3) will be used for batching * reads and writes. When cfg.use_aio is also enabled, the mode will * be LIO_NOWAIT. 
diff --git a/server/src/Makefile.am b/server/src/Makefile.am index 9e74de61e..ca5c62191 100644 --- a/server/src/Makefile.am +++ b/server/src/Makefile.am @@ -32,6 +32,8 @@ unifyfsd_SOURCES = \ unifyfs_service_manager.c \ unifyfs_service_manager.h \ unifyfs_server_pid.c \ + unifyfs_transfer.c \ + unifyfs_transfer.h \ unifyfs_tree.c \ unifyfs_tree.h diff --git a/server/src/extent_tree.h b/server/src/extent_tree.h index 0f0f2adf6..ef7d55369 100644 --- a/server/src/extent_tree.h +++ b/server/src/extent_tree.h @@ -27,6 +27,12 @@ struct extent_tree_node { unsigned long pos; /* physical offset of data in log */ }; +#define extent_tree_node_offset(node_ptr) \ + ((off_t)(node_ptr)->start) + +#define extent_tree_node_length(node_ptr) \ + ((size_t)1 + ((node_ptr)->end - (node_ptr)->start)) + struct extent_tree { RB_HEAD(ext_tree, extent_tree_node) head; pthread_rwlock_t rwlock; diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 86450bce9..1d7142233 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -194,6 +194,16 @@ static void register_server_server_rpcs(margo_instance_id mid) server_pid_in_t, server_pid_out_t, server_pid_rpc); + unifyfsd_rpc_context->rpcs.transfer_id = + MARGO_REGISTER(mid, "transfer_rpc", + transfer_in_t, transfer_out_t, + transfer_rpc); + + unifyfsd_rpc_context->rpcs.transfer_bcast_id = + MARGO_REGISTER(mid, "transfer_bcast_rpc", + transfer_bcast_in_t, transfer_bcast_out_t, + transfer_bcast_rpc); + unifyfsd_rpc_context->rpcs.truncate_id = MARGO_REGISTER(mid, "truncate_rpc", truncate_in_t, truncate_out_t, @@ -285,6 +295,10 @@ static void register_client_server_rpcs(margo_instance_id mid) unifyfs_filesize_in_t, unifyfs_filesize_out_t, unifyfs_filesize_rpc); + MARGO_REGISTER(mid, "unifyfs_transfer_rpc", + unifyfs_transfer_in_t, unifyfs_transfer_out_t, + unifyfs_transfer_rpc); + MARGO_REGISTER(mid, "unifyfs_truncate_rpc", unifyfs_truncate_in_t, unifyfs_truncate_out_t, unifyfs_truncate_rpc); @@ -313,6 +327,12 @@ static void register_client_server_rpcs(margo_instance_id mid) unifyfs_mread_req_complete_in_t, unifyfs_mread_req_complete_out_t, NULL); + + unifyfsd_rpc_context->rpcs.client_transfer_complete_id = + MARGO_REGISTER(mid, "unifyfs_transfer_complete_rpc", + unifyfs_transfer_complete_in_t, + unifyfs_transfer_complete_out_t, + NULL); } /* margo_server_rpc_init @@ -538,10 +558,11 @@ void* pull_margo_bulk_buffer(hg_handle_t rpc_hdl, * transfer size that the underlying transport supports, and a * large bulk transfer may result in failure. */ int i = 0; + hg_size_t max_bulk = UNIFYFS_SERVER_MAX_BULK_TX_SIZE; hg_size_t remain = bulk_sz; do { - hg_size_t offset = i * MAX_BULK_TX_SIZE; - hg_size_t len = remain < MAX_BULK_TX_SIZE ? remain : MAX_BULK_TX_SIZE; + hg_size_t offset = i * max_bulk; + hg_size_t len = (remain < max_bulk) ? 
remain : max_bulk; hret = margo_bulk_transfer(mid, HG_BULK_PULL, hgi->addr, bulk_remote, offset, bulk_local, offset, len); @@ -615,6 +636,8 @@ int invoke_client_mread_req_data_rpc(int app_id, /* fill input struct */ unifyfs_mread_req_data_in_t in; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; in.mread_id = (int32_t) mread_id; in.read_index = (int32_t) read_index; in.read_offset = (hg_size_t) read_offset; @@ -682,6 +705,8 @@ int invoke_client_mread_req_complete_rpc(int app_id, /* fill input struct */ unifyfs_mread_req_complete_in_t in; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; in.mread_id = (int32_t) mread_id; in.read_index = (int32_t) read_index; in.read_error = (int32_t) read_error; @@ -718,3 +743,56 @@ int invoke_client_mread_req_complete_rpc(int app_id, return ret; } + +/* invokes the client mread request completion rpc function */ +int invoke_client_transfer_complete_rpc(int app_id, + int client_id, + int transfer_id, + int error_code) +{ + hg_return_t hret; + + /* check that we have initialized margo */ + if (NULL == unifyfsd_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* fill input struct */ + unifyfs_transfer_complete_in_t in; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; + in.transfer_id = (int32_t) transfer_id; + in.error_code = (int32_t) error_code; + + /* get handle to rpc function */ + hg_id_t rpc_id = unifyfsd_rpc_context->rpcs.client_transfer_complete_id; + hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); + + /* call rpc function */ + LOGDBG("invoking the transfer[%d] complete rpc function in client", + transfer_id); + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_transfer_complete_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + + return ret; +} diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 151fb641d..5209d7353 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -44,6 +44,8 @@ typedef struct ServerRpcIds { hg_id_t metaset_id; hg_id_t fileattr_bcast_id; hg_id_t server_pid_id; + hg_id_t transfer_id; + hg_id_t transfer_bcast_id; hg_id_t truncate_id; hg_id_t truncate_bcast_id; hg_id_t unlink_bcast_id; @@ -51,6 +53,7 @@ typedef struct ServerRpcIds { /* client-server rpcs */ hg_id_t client_mread_data_id; hg_id_t client_mread_complete_id; + hg_id_t client_transfer_complete_id; } server_rpcs_t; typedef struct ServerRpcContext { @@ -75,9 +78,9 @@ hg_addr_t get_margo_server_address(int rank); /* use passed bulk handle to pull data into a newly allocated buffer. * returns buffer, or NULL on failure. 
*/ void* pull_margo_bulk_buffer(hg_handle_t rpc_hdl, - hg_bulk_t bulk_in, - hg_size_t bulk_sz, - hg_bulk_t* local_bulk); + hg_bulk_t bulk_in, + hg_size_t bulk_sz, + hg_bulk_t* local_bulk); /* invokes the client mread request data response rpc function */ int invoke_client_mread_req_data_rpc(int app_id, @@ -95,4 +98,10 @@ int invoke_client_mread_req_complete_rpc(int app_id, int read_index, int read_error); +/* invokes the client transfer request completion rpc function */ +int invoke_client_transfer_complete_rpc(int app_id, + int client_id, + int transfer_id, + int error_code); + #endif // MARGO_SERVER_H diff --git a/server/src/unifyfs_client_rpc.c b/server/src/unifyfs_client_rpc.c index f8c8f9c0f..f9333377a 100644 --- a/server/src/unifyfs_client_rpc.c +++ b/server/src/unifyfs_client_rpc.c @@ -158,7 +158,7 @@ static void unifyfs_mount_rpc(hg_handle_t handle) app_id, (int)in.dbg_rank); ret = (int)UNIFYFS_FAILURE; } else { - client_id = client->client_id; + client_id = client->state.client_id; LOGDBG("created new application client %d:%d", app_id, client_id); if (create_mountpoint) { @@ -542,6 +542,68 @@ static void unifyfs_filesize_rpc(hg_handle_t handle) } DEFINE_MARGO_RPC_HANDLER(unifyfs_filesize_rpc) +/* given an app_id, client_id, global file id, transfer mode + * and destination file, transfer data to that file */ +static void unifyfs_transfer_rpc(hg_handle_t handle) +{ + int ret = UNIFYFS_SUCCESS; + hg_return_t hret; + + /* get input params */ + unifyfs_transfer_in_t* in = malloc(sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + client_rpc_req_t* req = malloc(sizeof(client_rpc_req_t)); + if (NULL == req) { + ret = ENOMEM; + } else { + unifyfs_fops_ctx_t ctx = { + .app_id = in->app_id, + .client_id = in->client_id, + }; + req->req_type = UNIFYFS_CLIENT_RPC_TRANSFER; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = rm_submit_client_rpc_request(&ctx, req); + } + + if (ret != UNIFYFS_SUCCESS) { + if (NULL != req) { + free(req); + } + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + + /* return to caller */ + unifyfs_transfer_out_t out; + out.ret = (int32_t) ret; + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_transfer_rpc) + /* given an app_id, client_id, global file id, * and file size, truncate file to that size */ static void unifyfs_truncate_rpc(hg_handle_t handle) diff --git a/server/src/unifyfs_fops.h b/server/src/unifyfs_fops.h index 58f67bcfd..3f3d7f3a2 100644 --- a/server/src/unifyfs_fops.h +++ b/server/src/unifyfs_fops.h @@ -31,6 +31,13 @@ typedef struct _unifyfs_fops_ctx unifyfs_fops_ctx_t; typedef int (*unifyfs_fops_init_t)(unifyfs_cfg_t* cfg); +typedef int (*unifyfs_fops_fsync_t)(unifyfs_fops_ctx_t* ctx, int gfid); + +typedef int (*unifyfs_fops_filesize_t)(unifyfs_fops_ctx_t* ctx, + int gfid, size_t* filesize); + +typedef int (*unifyfs_fops_laminate_t)(unifyfs_fops_ctx_t* ctx, int gfid); + typedef int (*unifyfs_fops_metaget_t)(unifyfs_fops_ctx_t* ctx, int gfid, unifyfs_file_attr_t* attr); @@ -38,36 +45,37 @@ typedef int (*unifyfs_fops_metaset_t)(unifyfs_fops_ctx_t* ctx, int 
gfid, int attr_op, unifyfs_file_attr_t* attr); -typedef int (*unifyfs_fops_fsync_t)(unifyfs_fops_ctx_t* ctx, int gfid); +typedef int (*unifyfs_fops_mread_t)(unifyfs_fops_ctx_t* ctx, + size_t n_req, void* req); -typedef int (*unifyfs_fops_filesize_t)(unifyfs_fops_ctx_t* ctx, - int gfid, size_t* filesize); +typedef int (*unifyfs_fops_read_t)(unifyfs_fops_ctx_t* ctx, + int gfid, off_t offset, size_t len); + + +typedef int (*unifyfs_fops_transfer_t)(unifyfs_fops_ctx_t* ctx, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file); typedef int (*unifyfs_fops_truncate_t)(unifyfs_fops_ctx_t* ctx, int gfid, off_t len); -typedef int (*unifyfs_fops_laminate_t)(unifyfs_fops_ctx_t* ctx, int gfid); - typedef int (*unifyfs_fops_unlink_t)(unifyfs_fops_ctx_t* ctx, int gfid); -typedef int (*unifyfs_fops_read_t)(unifyfs_fops_ctx_t* ctx, - int gfid, off_t offset, size_t len); - -typedef int (*unifyfs_fops_mread_t)(unifyfs_fops_ctx_t* ctx, - size_t n_req, void* req); - struct unifyfs_fops { const char* name; unifyfs_fops_init_t init; + unifyfs_fops_filesize_t filesize; + unifyfs_fops_fsync_t fsync; + unifyfs_fops_laminate_t laminate; unifyfs_fops_metaget_t metaget; unifyfs_fops_metaset_t metaset; - unifyfs_fops_fsync_t fsync; - unifyfs_fops_filesize_t filesize; + unifyfs_fops_mread_t mread; + unifyfs_fops_read_t read; + unifyfs_fops_transfer_t transfer; unifyfs_fops_truncate_t truncate; - unifyfs_fops_laminate_t laminate; unifyfs_fops_unlink_t unlink; - unifyfs_fops_read_t read; - unifyfs_fops_mread_t mread; }; /* available file operations. */ @@ -98,25 +106,14 @@ static inline int unifyfs_fops_init(unifyfs_cfg_t* cfg) return ret; } -static inline int unifyfs_fops_metaget(unifyfs_fops_ctx_t* ctx, - int gfid, unifyfs_file_attr_t* attr) -{ - if (!global_fops_tab->metaget) { - return ENOSYS; - } - - return global_fops_tab->metaget(ctx, gfid, attr); -} - -static inline int unifyfs_fops_metaset(unifyfs_fops_ctx_t* ctx, - int gfid, int attr_op, - unifyfs_file_attr_t* attr) +static inline int unifyfs_fops_filesize(unifyfs_fops_ctx_t* ctx, + int gfid, size_t* filesize) { - if (!global_fops_tab->metaset) { + if (!global_fops_tab->filesize) { return ENOSYS; } - return global_fops_tab->metaset(ctx, gfid, attr_op, attr); + return global_fops_tab->filesize(ctx, gfid, filesize); } static inline int unifyfs_fops_fsync(unifyfs_fops_ctx_t* ctx, int gfid) @@ -128,42 +125,44 @@ static inline int unifyfs_fops_fsync(unifyfs_fops_ctx_t* ctx, int gfid) return global_fops_tab->fsync(ctx, gfid); } -static inline int unifyfs_fops_filesize(unifyfs_fops_ctx_t* ctx, - int gfid, size_t* filesize) +static inline int unifyfs_fops_laminate(unifyfs_fops_ctx_t* ctx, int gfid) { - if (!global_fops_tab->filesize) { + if (!global_fops_tab->laminate) { return ENOSYS; } - return global_fops_tab->filesize(ctx, gfid, filesize); + return global_fops_tab->laminate(ctx, gfid); } -static inline int unifyfs_fops_truncate(unifyfs_fops_ctx_t* ctx, - int gfid, off_t len) +static inline int unifyfs_fops_metaget(unifyfs_fops_ctx_t* ctx, + int gfid, unifyfs_file_attr_t* attr) { - if (!global_fops_tab->truncate) { + if (!global_fops_tab->metaget) { return ENOSYS; } - return global_fops_tab->truncate(ctx, gfid, len); + return global_fops_tab->metaget(ctx, gfid, attr); } -static inline int unifyfs_fops_laminate(unifyfs_fops_ctx_t* ctx, int gfid) +static inline int unifyfs_fops_metaset(unifyfs_fops_ctx_t* ctx, + int gfid, int attr_op, + unifyfs_file_attr_t* attr) { - if (!global_fops_tab->laminate) { + if (!global_fops_tab->metaset) { return 
ENOSYS; } - return global_fops_tab->laminate(ctx, gfid); + return global_fops_tab->metaset(ctx, gfid, attr_op, attr); } -static inline int unifyfs_fops_unlink(unifyfs_fops_ctx_t* ctx, int gfid) +static inline int unifyfs_fops_mread(unifyfs_fops_ctx_t* ctx, + size_t n_req, void* reqs) { - if (!global_fops_tab->unlink) { + if (!global_fops_tab->mread) { return ENOSYS; } - return global_fops_tab->unlink(ctx, gfid); + return global_fops_tab->mread(ctx, n_req, reqs); } static inline int unifyfs_fops_read(unifyfs_fops_ctx_t* ctx, @@ -178,14 +177,39 @@ static inline int unifyfs_fops_read(unifyfs_fops_ctx_t* ctx, return global_fops_tab->read(ctx, gfid, offset, len); } -static inline int unifyfs_fops_mread(unifyfs_fops_ctx_t* ctx, - size_t n_req, void* reqs) +static inline int unifyfs_fops_transfer(unifyfs_fops_ctx_t* ctx, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file) { - if (!global_fops_tab->mread) { + if (!global_fops_tab->transfer) { return ENOSYS; } - return global_fops_tab->mread(ctx, n_req, reqs); + return global_fops_tab->transfer(ctx, transfer_id, gfid, + transfer_mode, dest_file); } +static inline int unifyfs_fops_truncate(unifyfs_fops_ctx_t* ctx, + int gfid, off_t len) +{ + if (!global_fops_tab->truncate) { + return ENOSYS; + } + + return global_fops_tab->truncate(ctx, gfid, len); +} + +static inline int unifyfs_fops_unlink(unifyfs_fops_ctx_t* ctx, int gfid) +{ + if (!global_fops_tab->unlink) { + return ENOSYS; + } + + return global_fops_tab->unlink(ctx, gfid); +} + + + #endif /* __UNIFYFS_FOPS_H */ diff --git a/server/src/unifyfs_fops_mdhim.c b/server/src/unifyfs_fops_mdhim.c index 00bcdbcb5..a76a1a431 100644 --- a/server/src/unifyfs_fops_mdhim.c +++ b/server/src/unifyfs_fops_mdhim.c @@ -195,7 +195,7 @@ static int mdhim_fsync(unifyfs_fops_ctx_t* ctx, int gfid) } /* get pointer to superblock for this client and app */ - shm_context* super_ctx = client->shmem_super; + shm_context* super_ctx = client->state.shm_super_ctx; if (NULL == super_ctx) { LOGERR("missing client superblock"); return UNIFYFS_FAILURE; @@ -203,7 +203,7 @@ static int mdhim_fsync(unifyfs_fops_ctx_t* ctx, int gfid) char* superblk = (char*)(super_ctx->addr); /* get pointer to start of key/value region in superblock */ - char* meta = superblk + client->super_meta_offset; + char* meta = superblk + client->state.write_index.index_offset; /* get number of file extent index values client has for us, * stored as a size_t value in meta region of shared memory */ @@ -229,7 +229,7 @@ static int mdhim_fsync(unifyfs_fops_ctx_t* ctx, int gfid) size_t length = meta_payload[i].length; slices += meta_num_slices(offset, length); } - if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + if (slices >= UNIFYFS_MAX_META_SPLIT_COUNT) { LOGERR("Error allocating buffers"); return ENOMEM; } @@ -767,7 +767,7 @@ static int get_local_keyvals( *keyvals = NULL; /* allocate memory to copy key/value data */ - int max_keyvals = UNIFYFS_MAX_SPLIT_CNT; + int max_keyvals = UNIFYFS_MAX_META_SPLIT_COUNT; unifyfs_keyval_t* kvs_local = (unifyfs_keyval_t*) calloc( max_keyvals, sizeof(unifyfs_keyval_t)); if (NULL == kvs_local) { @@ -812,13 +812,14 @@ static int get_local_keyvals( /* we'll define key/values in these temp arrays that correspond * to extents we have locally */ - unifyfs_key_t tmpkeys[UNIFYFS_MAX_SPLIT_CNT]; - unifyfs_val_t tmpvals[UNIFYFS_MAX_SPLIT_CNT]; + unifyfs_key_t tmpkeys[UNIFYFS_MAX_META_SPLIT_COUNT]; + unifyfs_val_t tmpvals[UNIFYFS_MAX_META_SPLIT_COUNT]; /* look up any entries we can find in our local extent map */ int 
num_local = 0; int ret = unifyfs_inode_span_extents(gfid, start, end, - UNIFYFS_MAX_SPLIT_CNT, tmpkeys, tmpvals, &num_local); + UNIFYFS_MAX_META_SPLIT_COUNT, + tmpkeys, tmpvals, &num_local); if (ret) { LOGERR("failed to span extents (gfid=%d)", gfid); // now what? @@ -957,8 +958,8 @@ static int create_gfid_chunk_reads(reqmgr_thrd_t* thrd_ctrl, int gfid, /* this is to maintain limits imposed in previous code * that would throw fatal errors */ - if (num_vals >= UNIFYFS_MAX_SPLIT_CNT || - num_vals >= MAX_META_PER_SEND) { + if (num_vals >= UNIFYFS_MAX_META_SPLIT_COUNT || + num_vals >= UNIFYFS_MAX_META_PER_SEND) { LOGERR("too many key/values returned in range lookup"); if (NULL != keyvals) { free(keyvals); @@ -1033,7 +1034,7 @@ static int mdhim_read(unifyfs_fops_ctx_t* ctx, /* count number of slices this range covers */ size_t slices = meta_num_slices(offset, length); - if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + if (slices >= UNIFYFS_MAX_META_SPLIT_COUNT) { LOGERR("Error allocating buffers"); return ENOMEM; } @@ -1095,7 +1096,7 @@ static int mdhim_mread(unifyfs_fops_ctx_t* ctx, size_t num_req, void* reqbuf) /* add in number of slices this request needs */ slices += meta_num_slices(off, len); } - if (slices >= UNIFYFS_MAX_SPLIT_CNT) { + if (slices >= UNIFYFS_MAX_META_SPLIT_COUNT) { LOGERR("Error allocating buffers"); return ENOMEM; } diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c index c0f0c3a47..e74c1591d 100644 --- a/server/src/unifyfs_fops_rpc.c +++ b/server/src/unifyfs_fops_rpc.c @@ -23,16 +23,9 @@ static int rpc_init(unifyfs_cfg_t* cfg) { - int ret = 0; - long range_sz = 0; - - LOGDBG("initializing file operations.."); + int ret = UNIFYFS_SUCCESS; - ret = configurator_int_val(cfg->meta_range_size, &range_sz); - if (ret != 0) { - LOGERR("failed to read configuration (meta_range_size)"); - } - meta_slice_sz = (size_t) range_sz; + LOGDBG("initializing RPC file operations.."); return ret; } @@ -66,30 +59,12 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, /* assume we'll succeed */ int ret = UNIFYFS_SUCCESS; - /* get memory page size on this machine */ - int page_sz = (int) get_page_size(); - /* get application client */ app_client* client = get_app_client(ctx->app_id, ctx->client_id); if (NULL == client) { return EINVAL; } - /* get pointer to superblock for this client and app */ - shm_context* super_ctx = client->shmem_super; - if (NULL == super_ctx) { - LOGERR("missing client superblock"); - return UNIFYFS_FAILURE; - } - char* superblk = (char*)(super_ctx->addr); - - /* get pointer to start of key/value region in superblock */ - char* meta = superblk + client->super_meta_offset; - - /* get number of file extent index values client has for us, - * stored as a size_t value in meta region of shared memory */ - size_t num_extents = *(size_t*)(meta); - /* indices are stored in the superblock shared memory * created by the client, these are stored as index_t * structs starting one page size offset into meta region @@ -97,13 +72,19 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, * Is it safe to assume that the index information in this superblock is * not going to be modified by the client while we perform this operation? 
*/ - char* ptr_extents = meta + page_sz; + + /* get number of file extent index values client has for us, + * stored as a size_t value in index region of shared memory */ + size_t num_extents = *(client->state.write_index.ptr_num_entries); if (num_extents == 0) { return UNIFYFS_SUCCESS; /* Nothing to do */ } - unifyfs_index_t* meta_payload = (unifyfs_index_t*)(ptr_extents); + unifyfs_index_t* index_entry = client->state.write_index.index_entries; + + /* the sync rpc now contains extents from a single file/gfid */ + assert(gfid == index_entry[0].gfid); struct extent_tree_node* extents = calloc(num_extents, sizeof(*extents)); if (NULL == extents) { @@ -111,12 +92,9 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, return ENOMEM; } - /* the sync rpc now contains extents from a single file/gfid */ - assert(gfid == meta_payload[0].gfid); - for (i = 0; i < num_extents; i++) { - struct extent_tree_node* extent = &extents[i]; - unifyfs_index_t* meta = &meta_payload[i]; + struct extent_tree_node* extent = extents + i; + unifyfs_index_t* meta = index_entry + i; extent->start = meta->file_pos; extent->end = (meta->file_pos + meta->length) - 1; @@ -152,6 +130,28 @@ int rpc_filesize(unifyfs_fops_ctx_t* ctx, return unifyfs_invoke_filesize_rpc(gfid, filesize); } +static +int rpc_transfer(unifyfs_fops_ctx_t* ctx, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file) +{ + if (TRANSFER_MODE_OWNER == transfer_mode) { + return unifyfs_invoke_transfer_rpc(ctx->app_id, ctx->client_id, + transfer_id, gfid, transfer_mode, + dest_file); + } else if (TRANSFER_MODE_LOCAL == transfer_mode) { + return unifyfs_invoke_broadcast_transfer(ctx->app_id, ctx->client_id, + transfer_id, gfid, + transfer_mode, dest_file); + } else { + LOGERR("invalid transfer mode=%d"); + return EINVAL; + } + +} + static int rpc_truncate(unifyfs_fops_ctx_t* ctx, int gfid, @@ -363,17 +363,18 @@ int rpc_mread(unifyfs_fops_ctx_t* ctx, } static struct unifyfs_fops _fops_rpc = { - .name = "rpc", - .init = rpc_init, - .metaget = rpc_metaget, - .metaset = rpc_metaset, - .fsync = rpc_fsync, + .name = "rpc", + .init = rpc_init, .filesize = rpc_filesize, - .truncate = rpc_truncate, + .fsync = rpc_fsync, .laminate = rpc_laminate, - .unlink = rpc_unlink, - .read = rpc_read, - .mread = rpc_mread, + .metaget = rpc_metaget, + .metaset = rpc_metaset, + .mread = rpc_mread, + .read = rpc_read, + .transfer = rpc_transfer, + .truncate = rpc_truncate, + .unlink = rpc_unlink }; struct unifyfs_fops* unifyfs_fops_impl = &_fops_rpc; diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index f0f023d4c..60efc9525 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -47,6 +47,7 @@ // common headers #include "arraylist.h" #include "tree.h" +#include "unifyfs_client.h" #include "unifyfs_const.h" #include "unifyfs_log.h" #include "unifyfs_logio.h" @@ -129,25 +130,17 @@ typedef struct { // forward declaration of reqmgr_thrd struct reqmgr_thrd; + /** * Structure to maintain application client state, including * logio and shared memory contexts, margo rpc address, etc. */ typedef struct app_client { - int app_id; /* index of app in server app_configs array */ - int client_id; /* this client's index in app's clients array */ - int dbg_rank; /* client debug rank - NOT CURRENTLY USED */ - int connected; /* is client currently connected? 
*/ + unifyfs_client_state state; hg_addr_t margo_addr; /* client Margo address */ struct reqmgr_thrd* reqmgr; /* this client's request manager thread */ - - logio_context* logio; /* logio context for write data */ - - shm_context* shmem_super; /* shmem context for superblock region */ - size_t super_meta_offset; /* superblock offset to index metadata */ - size_t super_meta_size; /* size of index metadata region in bytes */ } app_client; /** diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index fdc72f98e..6dc5340d6 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -254,6 +254,11 @@ void collective_set_local_retval(coll_request* coll_req, int val) lbo->ret = val; break; } + case UNIFYFS_SERVER_BCAST_RPC_TRANSFER: { + transfer_bcast_out_t* tbo = (transfer_bcast_out_t*) output; + tbo->ret = val; + break; + } case UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; tbo->ret = val; @@ -315,6 +320,15 @@ static int coll_get_child_response(coll_request* coll_req, } break; } + case UNIFYFS_SERVER_BCAST_RPC_TRANSFER: { + transfer_bcast_out_t* ctbo = (transfer_bcast_out_t*) out; + transfer_bcast_out_t* tbo = (transfer_bcast_out_t*) output; + child_ret = ctbo->ret; + if (child_ret != UNIFYFS_SUCCESS) { + tbo->ret = child_ret; + } + break; + } case UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { truncate_bcast_out_t* ctbo = (truncate_bcast_out_t*) out; truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; @@ -636,7 +650,7 @@ int unifyfs_invoke_broadcast_extents_rpc(int gfid) * Broadcast file attributes and extents metadata due to laminate *************************************************************************/ -/* file extents metadata broadcast rpc handler */ +/* file lamination broadcast rpc handler */ static void laminate_bcast_rpc(hg_handle_t handle) { LOGDBG("BCAST_RPC: laminate handler"); @@ -797,6 +811,140 @@ int unifyfs_invoke_broadcast_laminate(int gfid) } +/************************************************************************* + * Broadcast file transfer request + *************************************************************************/ + +/* file transfer broadcast rpc handler */ +static void transfer_bcast_rpc(hg_handle_t handle) +{ + LOGDBG("BCAST_RPC: transfer handler"); + + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + coll_request* coll = NULL; + server_rpc_req_t* req = calloc(1, sizeof(*req)); + transfer_bcast_in_t* in = calloc(1, sizeof(*in)); + transfer_bcast_out_t* out = calloc(1, sizeof(*out)); + if ((NULL == req) || (NULL == in) || (NULL == out)) { + ret = ENOMEM; + } else { + /* get input params */ + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.transfer_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_TRANSFER; + coll = collective_create(rpc, handle, op_hgid, (int)(in->root), + (void*)in, (void*)out, sizeof(*out), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + req->req_type = rpc; + req->coll = coll; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); + } + } + } + } + } + + if (ret != UNIFYFS_SUCCESS) { + /* report failure back 
to caller */ + transfer_bcast_out_t tbo; + tbo.ret = (int32_t)ret; + hg_return_t hret = margo_respond(handle, &tbo); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + if (NULL != coll) { + collective_cleanup(coll); + } else { + margo_destroy(handle); + } + } +} +DEFINE_MARGO_RPC_HANDLER(transfer_bcast_rpc) + +/* Execute broadcast tree for attributes and extent metadata due to transfer */ +int unifyfs_invoke_broadcast_transfer(int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file) +{ + /* assuming success */ + int ret = UNIFYFS_SUCCESS; + + /* get attributes and extents metadata */ + unifyfs_file_attr_t attrs; + ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to get file attributes for gfid=%d", gfid); + return ret; + } + + if (!attrs.is_shared) { + /* no need to broadcast for private files */ + LOGDBG("gfid=%d is private, not broadcasting", gfid); + return UNIFYFS_SUCCESS; + } + + ret = sm_transfer(glb_pmi_rank, client_app, client_id, transfer_id, gfid, + transfer_mode, dest_file, NULL); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("sm_transfer() at root failed for gfid=%d", gfid); + return ret; + } + + LOGDBG("BCAST_RPC: starting transfer(mode=%d) for gfid=%d to file %s", + transfer_mode, gfid, dest_file); + + coll_request* coll = NULL; + transfer_bcast_in_t* in = calloc(1, sizeof(*in)); + if (NULL == in) { + ret = ENOMEM; + } else { + /* set input params */ + in->root = (int32_t) glb_pmi_rank; + in->gfid = (int32_t) gfid; + in->mode = (int32_t) transfer_mode; + in->dst_file = (hg_const_string_t) dest_file; + + hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.transfer_bcast_id; + server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_TRANSFER; + coll = collective_create(rpc, HG_HANDLE_NULL, op_hgid, + glb_pmi_rank, (void*)in, + NULL, sizeof(transfer_bcast_out_t), + HG_BULK_NULL, HG_BULK_NULL, NULL); + if (NULL == coll) { + ret = ENOMEM; + } else { + ret = collective_forward(coll); + if (ret == UNIFYFS_SUCCESS) { + ret = invoke_bcast_progress_rpc(coll); + } + } + } + + return ret; +} + + /************************************************************************* * Broadcast file truncation *************************************************************************/ diff --git a/server/src/unifyfs_group_rpc.h b/server/src/unifyfs_group_rpc.h index fac96912b..9c23c183f 100644 --- a/server/src/unifyfs_group_rpc.h +++ b/server/src/unifyfs_group_rpc.h @@ -85,6 +85,20 @@ int unifyfs_invoke_broadcast_fileattr(int gfid, */ int unifyfs_invoke_broadcast_laminate(int gfid); +/** + * @brief Broadcast request to transfer file to all servers + * + * @param gfid target file + * + * @return success|failure + */ +int unifyfs_invoke_broadcast_transfer(int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file); + /** * @brief Truncate target file at all servers * diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index a49aae931..cd3141e9a 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -542,7 +542,7 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, for (i = 0; i < n_extents; i++) { chunk_read_req_t* ext_chunks = resolved[i]; for (j = 0; j < n_resolved[i]; j++) { - /* debug_print_chunk_read_req(ext_chunks + j); */ + //debug_print_chunk_read_req(ext_chunks + j); *pos = ext_chunks[j]; pos++; } diff --git a/server/src/unifyfs_metadata_mdhim.c b/server/src/unifyfs_metadata_mdhim.c index 
5ae6635a6..bda3a9da1 100644 --- a/server/src/unifyfs_metadata_mdhim.c +++ b/server/src/unifyfs_metadata_mdhim.c @@ -42,6 +42,9 @@ #include "indexes.h" #include "mdhim.h" +#define UNIFYFS_META_DB_NAME unifyfs_db +#define UNIFYFS_META_DB_PATH RUNDIR + struct mdhim_t* md; /* we use two MDHIM indexes: @@ -115,7 +118,7 @@ int meta_init_store(unifyfs_cfg_t* cfg) return -1; } mdhim_options_set_db_type(db_opts, LEVELDB); - mdhim_options_set_db_name(db_opts, cfg->meta_db_name); + mdhim_options_set_db_name(db_opts, UNIFYFS_META_DB_NAME); mdhim_options_set_key_type(db_opts, MDHIM_UNIFYFS_KEY); mdhim_options_set_debug_level(db_opts, MLOG_CRIT); @@ -133,12 +136,7 @@ int meta_init_store(unifyfs_cfg_t* cfg) /* number of metadata servers = * number of unifyfs servers / UNIFYFS_META_SERVER_RATIO */ - svr_ratio = 0; - rc = configurator_int_val(cfg->meta_server_ratio, &svr_ratio); - if (rc != 0) { - return -1; - } - ratio = (int) svr_ratio; + ratio = (int) UNIFYFS_META_SERVER_RATIO; mdhim_options_set_server_factor(db_opts, ratio); /* indices/attributes are striped to servers according diff --git a/server/src/unifyfs_metadata_mdhim.h b/server/src/unifyfs_metadata_mdhim.h index 94ef193c7..e01f6ddca 100644 --- a/server/src/unifyfs_metadata_mdhim.h +++ b/server/src/unifyfs_metadata_mdhim.h @@ -34,6 +34,19 @@ #include "unifyfs_log.h" #include "unifyfs_meta.h" +/* number of metadata servers = + * number of unifyfs servers / UNIFYFS_META_SERVER_RATIO */ +#define UNIFYFS_META_SERVER_RATIO 1 + +/* max count of remote read requests (per-server) */ +#define UNIFYFS_MAX_META_PER_SEND (4 * KIB) + +/* max count of metadata slices for a single data extent */ +#define UNIFYFS_MAX_META_SPLIT_COUNT (4 * KIB) + +/* NOTE: The maximum size of an individual read operation is + * (UNIFYFS_MAX_META_SPLIT_COUNT * UNIFYFS_META_DEFAULT_SLICE_SZ) */ + /* Key for file attributes */ typedef int fattr_key_t; diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index ffe9c6a71..587798f83 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -1219,6 +1219,123 @@ static void laminate_rpc(hg_handle_t handle) DEFINE_MARGO_RPC_HANDLER(laminate_rpc) +/************************************************************************* + * File transfer request + *************************************************************************/ + +/* Transfer the target file */ +int unifyfs_invoke_transfer_rpc(int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file) +{ + int owner_rank = hash_gfid_to_server(gfid); + if (owner_rank == glb_pmi_rank) { + return sm_transfer(glb_pmi_rank, client_app, client_id, transfer_id, + gfid, transfer_mode, dest_file, NULL); + } + + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.transfer_id; + int rc = get_p2p_request_handle(req_hgid, owner_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* fill rpc input struct and forward request */ + transfer_in_t in; + in.src_rank = (int32_t) glb_pmi_rank; + in.client_app = (int32_t) client_app; + in.client_id = (int32_t) client_id; + in.transfer_id = (int32_t) transfer_id; + in.gfid = (int32_t) gfid; + in.mode = (int32_t) transfer_mode; + in.dst_file = (hg_const_string_t) dest_file; + rc = forward_p2p_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* wait for request completion */ + rc = wait_for_p2p_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + + /* get the output of 
the rpc */ + int ret; + transfer_out_t out; + hg_return_t hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } + margo_destroy(preq.handle); + + return ret; +} + +/* Transfer rpc handler */ +static void transfer_rpc(hg_handle_t handle) +{ + LOGDBG("transfer rpc handler"); + + int ret = UNIFYFS_SUCCESS; + + /* get input params */ + transfer_in_t* in = malloc(sizeof(*in)); + server_rpc_req_t* req = malloc(sizeof(*req)); + if ((NULL == in) || (NULL == req)) { + ret = ENOMEM; + } else { + hg_return_t hret = margo_get_input(handle, in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + req->req_type = UNIFYFS_SERVER_RPC_TRANSFER; + req->handle = handle; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + margo_free_input(handle, in); + } + } + } + + /* if we hit an error during request submission, respond with the error */ + if (ret != UNIFYFS_SUCCESS) { + if (NULL != in) { + free(in); + } + if (NULL != req) { + free(req); + } + + /* return to caller */ + transfer_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); + } +} +DEFINE_MARGO_RPC_HANDLER(transfer_rpc) + + /************************************************************************* * File truncation request *************************************************************************/ diff --git a/server/src/unifyfs_p2p_rpc.h b/server/src/unifyfs_p2p_rpc.h index 70af349c6..d8e73d8ec 100644 --- a/server/src/unifyfs_p2p_rpc.h +++ b/server/src/unifyfs_p2p_rpc.h @@ -143,6 +143,25 @@ int unifyfs_invoke_metaget_rpc(int gfid, int unifyfs_invoke_metaset_rpc(int gfid, int attr_op, unifyfs_file_attr_t* attrs); +/** + * @brief Transfer target file + * + * @param client_app requesting client app id + * @param client_id requesting client id + * @param transfer_id requesting client transfer id + * @param gfid target file + * @param transfer_mode transfer mode + * @param dest_file destination file + * + * @return success|failure + */ +int unifyfs_invoke_transfer_rpc(int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file); + /** * @brief Truncate target file * diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 6c2b71db5..a0b363039 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -40,7 +40,6 @@ #include "margo_server.h" #include "unifyfs_group_rpc.h" #include "unifyfs_p2p_rpc.h" - #include "unifyfs_server_rpcs.h" @@ -148,7 +147,6 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) thrd_ctrl->exit_flag = 0; thrd_ctrl->exited = 0; thrd_ctrl->waiting_for_work = 0; - thrd_ctrl->has_waiting_dispatcher = 0; /* launch request manager thread */ rc = pthread_create(&(thrd_ctrl->thrd), NULL, @@ -178,13 +176,13 @@ server_read_req_t* rm_reserve_read_req(reqmgr_thrd_t* thrd_ctrl) { server_read_req_t* rdreq = NULL; RM_REQ_LOCK(thrd_ctrl); - if (thrd_ctrl->num_read_reqs < RM_MAX_SERVER_READS) { - if (thrd_ctrl->next_rdreq_ndx < (RM_MAX_SERVER_READS - 1)) { + if (thrd_ctrl->num_read_reqs < UNIFYFS_SERVER_MAX_READS) { + if 
(thrd_ctrl->next_rdreq_ndx < (UNIFYFS_SERVER_MAX_READS - 1)) { rdreq = thrd_ctrl->read_reqs + thrd_ctrl->next_rdreq_ndx; assert((rdreq->req_ndx == 0) && (rdreq->in_use == 0)); rdreq->req_ndx = thrd_ctrl->next_rdreq_ndx++; } else { // search for unused slot - for (int i = 0; i < RM_MAX_SERVER_READS; i++) { + for (int i = 0; i < UNIFYFS_SERVER_MAX_READS; i++) { rdreq = thrd_ctrl->read_reqs + i; if ((rdreq->req_ndx == 0) && (rdreq->in_use == 0)) { rdreq->req_ndx = i; @@ -260,7 +258,7 @@ static void signal_new_responses(reqmgr_thrd_t* reqmgr) /* wake up the request manager thread */ RM_LOCK(reqmgr); if (reqmgr->waiting_for_work) { - /* have a reqmgr thread waiting on condition variable, + /* reqmgr thread is waiting on condition variable, * signal it to begin processing the responses we just added */ LOGDBG("signaling new responses"); pthread_cond_signal(&reqmgr->thrd_cond); @@ -455,8 +453,7 @@ int rm_request_exit(reqmgr_thrd_t* thrd_ctrl) /* inform reqmgr thread that it's time to exit */ thrd_ctrl->exit_flag = 1; - /* if reqmgr thread is not waiting in critical - * section, let's wait on it to come back */ + /* if reqmgr thread is waiting for work, wake it up */ if (thrd_ctrl->waiting_for_work) { /* signal reqmgr thread */ pthread_cond_signal(&thrd_ctrl->thrd_cond); @@ -491,7 +488,7 @@ static int rm_request_remote_chunks(reqmgr_thrd_t* thrd_ctrl) /* iterate over each active read request */ RM_REQ_LOCK(thrd_ctrl); - for (i = 0; i < RM_MAX_SERVER_READS; i++) { + for (i = 0; i < UNIFYFS_SERVER_MAX_READS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; if (!req->in_use) { continue; @@ -549,7 +546,7 @@ static int rm_process_remote_chunk_responses(reqmgr_thrd_t* thrd_ctrl) int ret = (int)UNIFYFS_SUCCESS; /* iterate over each active read request */ - for (i = 0; i < RM_MAX_SERVER_READS; i++) { + for (i = 0; i < UNIFYFS_SERVER_MAX_READS; i++) { server_read_req_t* req = thrd_ctrl->read_reqs + i; if (!req->in_use) { continue; @@ -668,7 +665,7 @@ int send_data_to_client(server_read_req_t* rdreq, } size_t data_size = (size_t) resp->read_rc; - size_t send_sz = MAX_DATA_TX_SIZE; + size_t send_sz = UNIFYFS_SERVER_MAX_DATA_TX_SIZE; char* bufpos = data; size_t resp_file_offset = resp->offset; @@ -1114,6 +1111,45 @@ static int process_read_rpc(reqmgr_thrd_t* reqmgr, return ret; } +static int process_transfer_rpc(reqmgr_thrd_t* reqmgr, + client_rpc_req_t* req) +{ + int ret = UNIFYFS_SUCCESS; + + unifyfs_transfer_in_t* in = req->input; + assert(in != NULL); + int transfer_id = in->transfer_id; + int gfid = in->gfid; + int mode = in->mode; + const char* dest_file = strdup(in->dst_file); + margo_free_input(req->handle, in); + free(in); + + LOGDBG("transferring gfid=%d to file %s", gfid, dest_file); + + unifyfs_fops_ctx_t ctx = { + .app_id = reqmgr->app_id, + .client_id = reqmgr->client_id, + }; + ret = unifyfs_fops_transfer(&ctx, transfer_id, gfid, mode, dest_file); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("unifyfs_fops_transfer() failed"); + } + + /* send rpc response */ + unifyfs_transfer_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + static int process_truncate_rpc(reqmgr_thrd_t* reqmgr, client_rpc_req_t* req) { @@ -1244,6 +1280,9 @@ static int rm_process_client_requests(reqmgr_thrd_t* reqmgr) case UNIFYFS_CLIENT_RPC_SYNC: rret = process_fsync_rpc(reqmgr, req); break; + case UNIFYFS_CLIENT_RPC_TRANSFER: + rret = 
process_transfer_rpc(reqmgr, req); + break; case UNIFYFS_CLIENT_RPC_TRUNCATE: rret = process_truncate_rpc(reqmgr, req); break; diff --git a/server/src/unifyfs_request_manager.h b/server/src/unifyfs_request_manager.h index e568f02f0..cc82caa6d 100644 --- a/server/src/unifyfs_request_manager.h +++ b/server/src/unifyfs_request_manager.h @@ -75,9 +75,6 @@ typedef struct reqmgr_thrd { /* flag indicating request manager thread is waiting on thrd_cond CV */ int waiting_for_work; - /* flag indicating a margo rpc handler ULT is waiting on thrd_cond CV */ - int has_waiting_dispatcher; - /* argobots mutex for synchronizing access to request state between * margo rpc handler ULTs and request manager thread */ ABT_mutex reqs_sync; @@ -85,7 +82,7 @@ typedef struct reqmgr_thrd { /* array of server read requests */ int num_read_reqs; int next_rdreq_ndx; - server_read_req_t read_reqs[RM_MAX_SERVER_READS]; + server_read_req_t read_reqs[UNIFYFS_SERVER_MAX_READS]; /* list of client rpc requests */ arraylist_t* client_reqs; diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 50ed9115e..daede2f54 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -59,8 +59,8 @@ server_info_t* glb_servers; // array of server_info_t unifyfs_cfg_t server_cfg; static ABT_mutex app_configs_abt_sync; -static app_config* app_configs[MAX_NUM_APPS]; /* list of apps */ -static size_t clients_per_app = MAX_APP_CLIENTS; +static app_config* app_configs[UNIFYFS_SERVER_MAX_NUM_APPS]; /* list of apps */ +static size_t clients_per_app = UNIFYFS_SERVER_MAX_APP_CLIENTS; static int unifyfs_exit(void); @@ -594,7 +594,7 @@ static int unifyfs_exit(void) /* iterate over each active application and free resources */ LOGDBG("cleaning application state"); ABT_mutex_lock(app_configs_abt_sync); - for (int i = 0; i < MAX_NUM_APPS; i++) { + for (int i = 0; i < UNIFYFS_SERVER_MAX_NUM_APPS; i++) { /* get pointer to app config for this app_id */ app_config* app = app_configs[i]; if (NULL != app) { @@ -639,7 +639,7 @@ static int unifyfs_exit(void) app_config* get_application(int app_id) { ABT_mutex_lock(app_configs_abt_sync); - for (int i = 0; i < MAX_NUM_APPS; i++) { + for (int i = 0; i < UNIFYFS_SERVER_MAX_NUM_APPS; i++) { app_config* app_cfg = app_configs[i]; if ((NULL != app_cfg) && (app_cfg->app_id == app_id)) { ABT_mutex_unlock(app_configs_abt_sync); @@ -672,7 +672,7 @@ app_config* new_application(int app_id, new_app->app_id = app_id; /* insert the given app_config in an empty slot */ - for (int i = 0; i < MAX_NUM_APPS; i++) { + for (int i = 0; i < UNIFYFS_SERVER_MAX_NUM_APPS; i++) { app_config* existing = app_configs[i]; if (NULL == existing) { new_app->clients = (app_client**) calloc(clients_per_app, @@ -772,8 +772,8 @@ static unifyfs_rc attach_to_client_shmem(app_client* client, return EINVAL; } - int app_id = client->app_id; - int client_id = client->client_id; + int app_id = client->state.app_id; + int client_id = client->state.client_id; /* initialize shmem region for client's superblock */ sprintf(shm_name, SHMEM_SUPER_FMTSTR, app_id, client_id); @@ -782,7 +782,7 @@ static unifyfs_rc attach_to_client_shmem(app_client* client, LOGERR("Failed to attach to shmem superblock region %s", shm_name); return UNIFYFS_ERROR_SHMEM; } - client->shmem_super = shm_ctx; + client->state.shm_super_ctx = shm_ctx; return UNIFYFS_SUCCESS; } @@ -815,9 +815,9 @@ app_client* new_app_client(app_config* app, app_client* client = (app_client*) calloc(1, sizeof(app_client)); if (NULL != client) { int failure = 0; - 
client->app_id = app_id; - client->client_id = client_id; - client->dbg_rank = debug_rank; + client->state.app_id = app_id; + client->state.client_id = client_id; + client->state.app_rank = debug_rank; /* convert client_addr_str to margo hg_addr_t */ hg_return_t hret = margo_addr_lookup(unifyfsd_rpc_context->shm_mid, @@ -867,8 +867,8 @@ unifyfs_rc attach_app_client(app_client* client, return EINVAL; } - int app_id = client->app_id; - int client_id = client->client_id; + int app_id = client->state.app_id; + int client_id = client->state.client_id; int failure = 0; /* initialize server-side logio for this client */ @@ -876,7 +876,7 @@ unifyfs_rc attach_app_client(app_client* client, logio_shmem_size, logio_spill_size, logio_spill_dir, - &(client->logio)); + &(client->state.logio_ctx)); if (rc != UNIFYFS_SUCCESS) { failure = 1; } @@ -892,9 +892,16 @@ unifyfs_rc attach_app_client(app_client* client, return UNIFYFS_FAILURE; } - client->super_meta_offset = super_meta_offset; - client->super_meta_size = super_meta_size; - client->connected = 1; + client->state.write_index.index_offset = super_meta_offset; + client->state.write_index.index_size = super_meta_size; + + char* super_ptr = (char*)(client->state.shm_super_ctx->addr); + char* index_ptr = super_ptr + super_meta_offset; + client->state.write_index.ptr_num_entries = (size_t*) index_ptr; + index_ptr += get_page_size(); + client->state.write_index.index_entries = (unifyfs_index_t*) index_ptr; + + client->state.initialized = 1; return UNIFYFS_SUCCESS; } @@ -909,12 +916,12 @@ unifyfs_rc disconnect_app_client(app_client* client) return EINVAL; } - if (!client->connected) { + if (!client->state.initialized) { /* already done */ return UNIFYFS_SUCCESS; } - client->connected = 0; + client->state.initialized = 0; /* stop client request manager thread */ if (NULL != client->reqmgr) { @@ -926,12 +933,12 @@ unifyfs_rc disconnect_app_client(app_client* client) client->margo_addr); /* release client shared memory regions */ - if (NULL != client->shmem_super) { + if (NULL != client->state.shm_super_ctx) { /* Release superblock shared memory region. * Server is responsible for deleting superblock shared * memory file that was created by the client. 
*/ - unifyfs_shm_unlink(client->shmem_super); - unifyfs_shm_free(&(client->shmem_super)); + unifyfs_shm_unlink(client->state.shm_super_ctx); + unifyfs_shm_free(&(client->state.shm_super_ctx)); } return UNIFYFS_SUCCESS; @@ -954,18 +961,18 @@ unifyfs_rc cleanup_app_client(app_config* app, app_client* client) } LOGDBG("cleaning application client %d:%d", - client->app_id, client->client_id); + client->state.app_id, client->state.client_id); disconnect_app_client(client); /* close client logio context */ - if (NULL != client->logio) { - unifyfs_logio_close(client->logio, 1); - client->logio = NULL; + if (NULL != client->state.logio_ctx) { + unifyfs_logio_close(client->state.logio_ctx, 1); + client->state.logio_ctx = NULL; } /* reset app->clients array index if set */ - int client_ndx = client->client_id - 1; /* client ids start at 1 */ + int client_ndx = client->state.client_id - 1; /* client ids start at 1 */ if (client == app->clients[client_ndx]) { app->clients[client_ndx] = NULL; } diff --git a/server/src/unifyfs_server_pid.c b/server/src/unifyfs_server_pid.c index df71622d5..99c1bcf96 100644 --- a/server/src/unifyfs_server_pid.c +++ b/server/src/unifyfs_server_pid.c @@ -80,7 +80,7 @@ static int create_server_pid_file(void) } snprintf(filename, sizeof(filename), "%s/%s", - server_cfg.sharedfs_dir, UNIFYFSD_PID_FILENAME); + server_cfg.sharedfs_dir, UNIFYFS_SERVER_PID_FILENAME); fp = fopen(filename, "w"); if (!fp) { diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index 53961c308..b16840247 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -33,6 +33,7 @@ #include "unifyfs_request_manager.h" #include "unifyfs_service_manager.h" #include "unifyfs_server_rpcs.h" +#include "unifyfs_transfer.h" #include "margo_server.h" /* Service Manager (SM) state */ @@ -60,6 +61,10 @@ typedef struct { /* list of chunk read requests from remote servers */ arraylist_t* chunk_reads; + /* list of local transfer requests */ + arraylist_t* local_transfers; + arraylist_t* completed_transfers; + /* list of service requests (server_rpc_req_t*) */ arraylist_t* svc_reqs; @@ -98,11 +103,70 @@ do { \ } \ } while (0) + +static inline void signal_svcmgr(void) +{ + pid_t this_thread = unifyfs_gettid(); + if (this_thread != sm->tid) { + /* signal svcmgr to begin processing the requests we just added */ + LOGDBG("signaling new service requests"); + pthread_cond_signal(&(sm->thrd_cond)); + } +} + +/* submit a request to the service manager thread */ +int sm_submit_service_request(server_rpc_req_t* req) +{ + if ((NULL == sm) || (NULL == sm->svc_reqs)) { + return UNIFYFS_FAILURE; + } + + SM_REQ_LOCK(); + arraylist_add(sm->svc_reqs, req); + SM_REQ_UNLOCK(); + + signal_svcmgr(); + + return UNIFYFS_SUCCESS; +} + +/* submit a transfer request to the service manager thread */ +int sm_submit_transfer_request(transfer_thread_args* tta) +{ + if ((NULL == sm) || (NULL == sm->local_transfers)) { + return UNIFYFS_FAILURE; + } + + SM_REQ_LOCK(); + arraylist_add(sm->local_transfers, tta); + SM_REQ_UNLOCK(); + + signal_svcmgr(); + + return UNIFYFS_SUCCESS; +} + +/* tell service manager thread transfer has completed */ +int sm_complete_transfer_request(transfer_thread_args* tta) +{ + if ((NULL == sm) || (NULL == sm->completed_transfers)) { + return UNIFYFS_FAILURE; + } + + SM_REQ_LOCK(); + arraylist_add(sm->completed_transfers, tta); + SM_REQ_UNLOCK(); + + signal_svcmgr(); + + return UNIFYFS_SUCCESS; +} + /* initialize and launch service manager thread */ int 
svcmgr_init(void) { - /* allocate a service manager struct, - * store in global variable */ + /* allocate a struct to maintain service manager state. + * store pointer to struct in a global variable */ sm = (svcmgr_state_t*)calloc(1, sizeof(svcmgr_state_t)); if (NULL == sm) { LOGERR("failed to allocate service manager state!"); @@ -110,7 +174,7 @@ int svcmgr_init(void) } /* initialize lock for shared data structures of the - * request manager */ + * service manager */ pthread_mutexattr_t attr; pthread_mutexattr_init(&attr); pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); @@ -143,6 +207,20 @@ int svcmgr_init(void) return ENOMEM; } + /* allocate lists to track local transfer requests */ + sm->local_transfers = arraylist_create(0); + if (sm->local_transfers == NULL) { + LOGERR("failed to allocate service manager local_transfers!"); + svcmgr_fini(); + return ENOMEM; + } + sm->completed_transfers = arraylist_create(0); + if (sm->completed_transfers == NULL) { + LOGERR("failed to allocate service manager completed_transfers!"); + svcmgr_fini(); + return ENOMEM; + } + /* allocate a list to track service requests */ sm->svc_reqs = arraylist_create(0); if (sm->svc_reqs == NULL) { @@ -158,7 +236,7 @@ int svcmgr_init(void) if (rc != 0) { LOGERR("failed to create service manager thread"); svcmgr_fini(); - return UNIFYFS_ERROR_THRDINIT; + return UNIFYFS_ERROR_THREAD; } return UNIFYFS_SUCCESS; @@ -183,6 +261,14 @@ int svcmgr_fini(void) arraylist_free(sm->chunk_reads); } + if (NULL != sm->local_transfers) { + arraylist_free(sm->local_transfers); + } + + if (NULL != sm->completed_transfers) { + arraylist_free(sm->completed_transfers); + } + if (NULL != sm->svc_reqs) { arraylist_free(sm->svc_reqs); } @@ -298,7 +384,7 @@ int sm_issue_chunk_reads(int src_rank, int cli_id = rreq->log_client_id; app_clnt = get_app_client(app_id, cli_id); if (NULL != app_clnt) { - logio_context* logio_ctx = app_clnt->logio; + logio_context* logio_ctx = app_clnt->state.logio_ctx; if (NULL != logio_ctx) { size_t nread = 0; int rc = unifyfs_logio_read(logio_ctx, log_offset, nbytes, @@ -459,6 +545,54 @@ int sm_find_extents(int gfid, return ret; } +int sm_transfer(int client_server, + int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file, + server_rpc_req_t* bcast_req) +{ + int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); + + unifyfs_file_attr_t attrs; + int ret = unifyfs_inode_metaget(gfid, &attrs); + if (ret == UNIFYFS_SUCCESS) { + /* we have local file state */ + LOGDBG("transfer - gfid=%d mode=%d file=%s", + gfid, transfer_mode, dest_file); + transfer_thread_args* tta = calloc(1, sizeof(*tta)); + if (transfer_mode == TRANSFER_MODE_LOCAL) { + /* each server transfers local data to the destination file */ + int rc = create_local_transfers(gfid, dest_file, tta); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } else { + /* submit transfer request for processing */ + tta->bcast_req = bcast_req; + tta->client_server = client_server; + tta->client_app = client_app; + tta->client_id = client_id; + tta->transfer_id = transfer_id; + rc = sm_submit_transfer_request(tta); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + } + } else if (is_owner && (transfer_mode == TRANSFER_MODE_OWNER)) { + // TODO: support TRANSFER_MODE_OWNER + ret = UNIFYFS_ERROR_NYI; + } + if (ret != UNIFYFS_SUCCESS) { + LOGERR("transfer(gfid=%d, mode=%d, file=%s) failed", + gfid, transfer_mode, dest_file); + } + } + return ret; +} + int sm_truncate(int gfid, size_t 
filesize) { int owner_rank = hash_gfid_to_server(gfid); @@ -471,7 +605,7 @@ int sm_truncate(int gfid, size_t filesize) size_t old_size = (size_t) attrs.size; LOGDBG("truncate - gfid=%d size=%zu old-size=%zu", gfid, filesize, old_size); - int ret = unifyfs_inode_truncate(gfid, (unsigned long)filesize); + ret = unifyfs_inode_truncate(gfid, (unsigned long)filesize); if (ret != UNIFYFS_SUCCESS) { LOGERR("truncate(gfid=%d, size=%zu) failed", gfid, filesize); @@ -504,7 +638,7 @@ static int send_chunk_read_responses(void) * list on the service manager structure */ int num_chunk_reads = arraylist_size(sm->chunk_reads); if (num_chunk_reads) { - /* got some chunk read requets, take the list and replace + /* got some chunk read requests, take the list and replace * it with an empty list */ LOGDBG("processing %d chunk read responses", num_chunk_reads); chunk_reads = sm->chunk_reads; @@ -531,30 +665,106 @@ static int send_chunk_read_responses(void) return rc; } -static inline void signal_new_requests(void) +static int spawn_local_transfers(void) { - pid_t this_thread = unifyfs_gettid(); - if (this_thread != sm->tid) { - /* signal svcmgr to begin processing the requests we just added */ - LOGDBG("signaling new service requests"); - pthread_cond_signal(&(sm->thrd_cond)); + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + /* this will hold a list of local transfers if we find any */ + arraylist_t* transfers = NULL; + + /* lock to access global service manager object */ + SM_REQ_LOCK(); + + /* if we have any local transfers, take pointer to the list + * of transfer args and replace it with a newly allocated + * list on the service manager structure */ + int num_transfers = arraylist_size(sm->local_transfers); + if (num_transfers) { + /* got some transfer requests, take the list and replace + * it with an empty list */ + LOGDBG("processing %d local transfers", num_transfers); + transfers = sm->local_transfers; + sm->local_transfers = arraylist_create(0); } + + /* release lock on service manager object */ + SM_REQ_UNLOCK(); + + /* iterate over each transfer and spawn helper thread */ + transfer_thread_args* tta; + for (int i = 0; i < num_transfers; i++) { + /* get next transfer */ + tta = (transfer_thread_args*) arraylist_remove(transfers, i); + + /* spawn transfer helper thread */ + int rc = pthread_create(&(tta->thrd), NULL, + transfer_helper_thread, (void*)tta); + if (rc != 0) { + LOGERR("failed to spawn transfer helper thread for tta=%p", tta); + ret = UNIFYFS_ERROR_THREAD; + release_transfer_thread_args(tta); + } + } + + return ret; } -/* submit a request to the service manager thread */ -int sm_submit_service_request(server_rpc_req_t* req) +static int complete_local_transfers(void) { - if ((NULL == sm) || (NULL == sm->svc_reqs)) { - return UNIFYFS_FAILURE; - } + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + /* this will hold a list of local transfers if we find any */ + arraylist_t* transfers = NULL; + + /* lock to access global service manager object */ SM_REQ_LOCK(); - arraylist_add(sm->svc_reqs, req); + + /* if we have any local transfers, take pointer to the list + * of transfer args and replace it with a newly allocated + * list on the service manager structure */ + int num_transfers = arraylist_size(sm->completed_transfers); + if (num_transfers) { + /* got some transfer requests, take the list and replace + * it with an empty list */ + LOGDBG("completing %d local transfers", num_transfers); + transfers = sm->completed_transfers; + sm->completed_transfers = 
arraylist_create(0); + } + + /* release lock on service manager object */ SM_REQ_UNLOCK(); - signal_new_requests(); + /* iterate over each transfer and spawn helper thread */ + transfer_thread_args* tta; + for (int i = 0; i < num_transfers; i++) { + /* get next transfer */ + tta = (transfer_thread_args*) arraylist_remove(transfers, i); + + /* spawn transfer helper thread */ + int rc = pthread_join(tta->thrd, NULL); + if (rc != 0) { + LOGERR("failed to join transfer helper thread for tta=%p", tta); + ret = UNIFYFS_ERROR_THREAD; + } + + if (glb_pmi_rank == tta->client_server) { + rc = invoke_client_transfer_complete_rpc(tta->client_app, + tta->client_id, + tta->transfer_id, + tta->status); + if (rc != 0) { + LOGERR("failed transfer(id=%d) complete rpc to client[%d:%d]", + tta->transfer_id, tta->client_app, tta->client_id); + ret = rc; + } + } - return UNIFYFS_SUCCESS; + release_transfer_thread_args(tta); + } + + return ret; } static int process_chunk_read_rpc(server_rpc_req_t* req) @@ -829,6 +1039,39 @@ static int process_server_pid_rpc(server_rpc_req_t* req) return ret; } +static int process_transfer_rpc(server_rpc_req_t* req) +{ + /* get target file and requested file size */ + transfer_in_t* in = req->input; + int src_rank = (int) in->src_rank; + int client_app = (int) in->client_app; + int client_id = (int) in->client_id; + int transfer_id = (int) in->transfer_id; + int gfid = (int) in->gfid; + int transfer_mode = (int) in->mode; + char* dest_file = strdup(in->dst_file); + margo_free_input(req->handle, in); + free(in); + + /* do file transfer */ + int ret = sm_transfer(src_rank, client_app, client_id, transfer_id, + gfid, transfer_mode, dest_file, NULL); + free(dest_file); + + /* send rpc response */ + transfer_out_t out; + out.ret = (int32_t) ret; + hg_return_t hret = margo_respond(req->handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* cleanup req */ + margo_destroy(req->handle); + + return ret; +} + static int process_truncate_rpc(server_rpc_req_t* req) { /* get target file and requested file size */ @@ -952,6 +1195,31 @@ static int process_laminate_bcast_rpc(server_rpc_req_t* req) return ret; } +static int process_transfer_bcast_rpc(server_rpc_req_t* req) +{ + /* get target file and requested file size */ + transfer_bcast_in_t* in = req->input; + int src_rank = (int) in->root; + int gfid = (int) in->gfid; + int transfer_mode = (int) in->mode; + const char* dest_file = (const char*) in->dst_file; + + LOGDBG("gfid=%d file=%s", gfid, dest_file); + + /* do file transfer */ + int ret = sm_transfer(src_rank, -1, -1, -1, gfid, transfer_mode, + dest_file, req); + if (UNIFYFS_SUCCESS != ret) { + /* submission of transfer request failed */ + collective_set_local_retval(req->coll, ret); + + /* create a ULT to finish broadcast operation */ + ret = invoke_bcast_progress_rpc(req->coll); + } + + return ret; +} + static int process_truncate_bcast_rpc(server_rpc_req_t* req) { /* get target file and requested file size */ @@ -1067,6 +1335,9 @@ static int process_service_requests(void) case UNIFYFS_SERVER_RPC_PID_REPORT: rret = process_server_pid_rpc(req); break; + case UNIFYFS_SERVER_RPC_TRANSFER: + rret = process_transfer_rpc(req); + break; case UNIFYFS_SERVER_RPC_TRUNCATE: rret = process_truncate_rpc(req); break; @@ -1079,6 +1350,9 @@ static int process_service_requests(void) case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: rret = process_laminate_bcast_rpc(req); break; + case UNIFYFS_SERVER_BCAST_RPC_TRANSFER: + rret = process_transfer_bcast_rpc(req); + break; case 
UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: rret = process_truncate_bcast_rpc(req); break; @@ -1157,6 +1431,11 @@ void* service_manager_thread(void* arg) LOGERR("failed to send chunk read responses"); } + rc = spawn_local_transfers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to send chunk read responses"); + } + #if defined(USE_SVCMGR_PROGRESS_TIMER) if (have_progress_timer) { /* cancel progress alarm */ @@ -1191,6 +1470,11 @@ void* service_manager_thread(void* arg) sm->waiting_for_work = 0; SM_UNLOCK(); + rc = complete_local_transfers(); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to complete local transfers"); + } + if (sm->time_to_exit) { break; } diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index 05cc441fd..004c59353 100644 --- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -31,15 +31,8 @@ #define UNIFYFS_SERVICE_MANAGER_H #include "unifyfs_global.h" +#include "unifyfs_transfer.h" -typedef struct { - server_rpc_e req_type; - hg_handle_t handle; - void* coll; - void* input; - void* bulk_buf; - size_t bulk_sz; -} server_rpc_req_t; /* service manager pthread routine */ void* service_manager_thread(void* ctx); @@ -59,6 +52,12 @@ int svcmgr_fini(void); */ int sm_submit_service_request(server_rpc_req_t* req); +/* submit a transfer request to the service manager thread */ +int sm_submit_transfer_request(transfer_thread_args* tta); + +/* tell service manager thread transfer has completed */ +int sm_complete_transfer_request(transfer_thread_args* tta); + /* decode and issue chunk reads contained in message buffer */ int sm_issue_chunk_reads(int src_rank, int src_app_id, @@ -89,7 +88,17 @@ int sm_find_extents(int gfid, unsigned int* out_num_chunks, chunk_read_req_t** out_chunks); +int sm_transfer(int client_server, + int client_app, + int client_id, + int transfer_id, + int gfid, + int transfer_mode, + const char* dest_file, + server_rpc_req_t* bcast_req); + int sm_truncate(int gfid, size_t filesize); + #endif // UNIFYFS_SERVICE_MANAGER_H diff --git a/server/src/unifyfs_transfer.c b/server/src/unifyfs_transfer.c new file mode 100644 index 000000000..c0b70c533 --- /dev/null +++ b/server/src/unifyfs_transfer.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + + +#include "unifyfs_inode.h" +#include "unifyfs_group_rpc.h" +#include "unifyfs_service_manager.h" +#include "unifyfs_transfer.h" +#include + +/* maximum length in bytes for pwrite() transfers */ +#ifndef UNIFYFS_TRANSFER_MAX_WRITE +#define UNIFYFS_TRANSFER_MAX_WRITE (16 * 1048576) // 16 MiB +#endif + +/* maximum memory allocation for temporary transfer data copies */ +#ifndef UNIFYFS_TRANSFER_MAX_BUFFER +#define UNIFYFS_TRANSFER_MAX_BUFFER (512 * 1048576) // 512 MiB +#endif + +typedef struct transfer_chunk { + char* chunk_data; + size_t chunk_sz; + off_t file_offset; +} transfer_chunk; + +/* write a transfer_chunk to given file descriptor */ +static int write_transfer_chunk(int fd, + transfer_chunk* chk) +{ + // TODO: use lio_listio to submit all writes at once? 
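+    /*
+     * A possible lio_listio() batching, left only as a sketch in this
+     * comment: it would need #include <aio.h> and a caller that hands over
+     * the whole chunks[] array built in transfer_helper_thread() instead of
+     * one chunk at a time. The names n_chunks and chks below are
+     * illustrative, not existing identifiers in this file:
+     *
+     *   struct aiocb cbs[n_chunks];
+     *   struct aiocb* cb_list[n_chunks];
+     *   memset(cbs, 0, sizeof(cbs));
+     *   for (size_t i = 0; i < n_chunks; i++) {
+     *       cbs[i].aio_fildes     = fd;
+     *       cbs[i].aio_buf        = chks[i].chunk_data;
+     *       cbs[i].aio_nbytes     = chks[i].chunk_sz;
+     *       cbs[i].aio_offset     = chks[i].file_offset;
+     *       cbs[i].aio_lio_opcode = LIO_WRITE;
+     *       cb_list[i] = &cbs[i];
+     *   }
+     *   if (-1 == lio_listio(LIO_WAIT, cb_list, (int)n_chunks, NULL)) {
+     *       return errno;
+     *   }
+     *   // per-operation status would still need aio_return(&cbs[i]) checks
+     */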
+ size_t max_write = UNIFYFS_TRANSFER_MAX_WRITE; + size_t n_write = 0; + size_t n_remain = chk->chunk_sz; + do { + char* data = chk->chunk_data + n_write; + off_t off = chk->file_offset + (off_t)n_write; + size_t n_bytes = (n_remain > max_write ? max_write : n_remain); + ssize_t szrc = pwrite(fd, data, n_bytes, off); + if (-1 == szrc) { + int err = errno; + if ((err != EINTR) && (err != EAGAIN)) { + LOGERR("pwrite(dst_fd=%d, sz=%zu) failed: %s", + fd, n_remain, strerror(err)); + return err; + } + + } else { + n_write += szrc; + n_remain -= szrc; + } + } while (n_remain); + + return UNIFYFS_SUCCESS; +} + +static int read_local_extent(struct extent_tree_node* ext, + transfer_chunk* chk) +{ + int ret = UNIFYFS_SUCCESS; + + char* buf = chk->chunk_data; + chk->chunk_sz = extent_tree_node_length(ext); + chk->file_offset = extent_tree_node_offset(ext); + + /* read data from client log */ + app_client* app_clnt = NULL; + int app_id = ext->app_id; + int cli_id = ext->cli_id; + off_t log_offset = (off_t) ext->pos; + app_clnt = get_app_client(app_id, cli_id); + if (NULL != app_clnt) { + logio_context* logio_ctx = app_clnt->state.logio_ctx; + if (NULL != logio_ctx) { + size_t nread = 0; + int rc = unifyfs_logio_read(logio_ctx, log_offset, chk->chunk_sz, + buf, &nread); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + } else { + LOGERR("app client [%d:%d] has NULL logio context", + app_id, cli_id); + ret = EINVAL; + } + } else { + LOGERR("failed to get application client [%d:%d] state", + app_id, cli_id); + ret = EINVAL; + } + + return ret; +} + +/* find local extents for the given gfid and initialize transfer helper + * thread state */ +int create_local_transfers(int gfid, + const char* dest_file, + transfer_thread_args* tta) +{ + if ((NULL == dest_file) || (NULL == tta)) { + return EINVAL; + } + + size_t n_extents = 0; + struct extent_tree_node* extents = NULL; + int rc = unifyfs_inode_get_extents(gfid, &n_extents, &extents); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to get extents from inode for gfid=%d", gfid); + return rc; + } else if (n_extents == 0) { + return UNIFYFS_SUCCESS; + } + + /* determine local extents */ + struct extent_tree_node* ext; + size_t n_local_extents = 0; + size_t total_local_data_sz = 0; + for (size_t i = 0; i < n_extents; i++) { + ext = extents + i; + if (glb_pmi_rank == ext->svr_rank) { + total_local_data_sz += extent_tree_node_length(ext); + n_local_extents++; + } + } + + /* make an array of local extents */ + struct extent_tree_node* local_extents = (struct extent_tree_node*) + calloc(n_local_extents, sizeof(struct extent_tree_node)); + if (NULL == local_extents) { + LOGERR("failed to allocate local extents for gfid=%d", gfid); + free(extents); + return ENOMEM; + } + + struct extent_tree_node* dst_ext; + size_t ext_ndx = 0; + for (size_t i = 0; i < n_extents; i++) { + ext = extents + i; + if (glb_pmi_rank == ext->svr_rank) { + dst_ext = local_extents + ext_ndx; + ext_ndx++; + memcpy(dst_ext, ext, sizeof(*ext)); + } + } + + free(extents); + + tta->dst_file = strdup(dest_file); + tta->gfid = gfid; + tta->local_extents = local_extents; + tta->n_extents = n_local_extents; + tta->local_data_sz = total_local_data_sz; + + return UNIFYFS_SUCCESS; +} + +void release_transfer_thread_args(transfer_thread_args* tta) +{ + if (NULL != tta) { + if (NULL != tta->local_extents) { + free(tta->local_extents); + } + if (NULL != tta->dst_file) { + free((char*)(tta->dst_file)); + } + free(tta); + } +} + +void* transfer_helper_thread(void* arg) +{ + transfer_thread_args* tta = 
(transfer_thread_args*)arg; + assert(NULL != arg); + + int rc; + int ret = UNIFYFS_SUCCESS; + char* data_copy_buf = NULL; + transfer_chunk* chunks = NULL; + struct extent_tree_node* ext; + transfer_chunk* chk; + + LOGDBG("I am transfer thread for gfid=%d file=%s", + tta->gfid, tta->dst_file); + + /* open destination file (create if it doesn't exist) */ + int flags = O_CREAT | O_WRONLY; + int mode = 0640; + int fd = open(tta->dst_file, flags, mode); + if (fd == -1) { + int err = errno; + LOGERR("failed to open(%s) - %s", tta->dst_file, strerror(err)); + tta->status = err; + return arg; + } + + /* get number of local extents and their total size */ + size_t total_local_data_sz = tta->local_data_sz; + size_t n_extents = tta->n_extents; + + /* allocate transfer_chunk array */ + chunks = calloc(n_extents, sizeof(transfer_chunk)); + if (NULL == chunks) { + LOGERR("failed to allocate transfer chunk state"); + ret = ENOMEM; + goto transfer_cleanup; + } + + /* allocate copy buffer for chunk data */ + size_t max_buffer = UNIFYFS_TRANSFER_MAX_BUFFER; + size_t buf_sz = max_buffer; + if (total_local_data_sz <= max_buffer) { + buf_sz = total_local_data_sz; + } else { + /* make sure longest extent will fit in copy buffer */ + for (size_t i = 0; i < n_extents; i++) { + ext = tta->local_extents + i; + size_t ext_sz = extent_tree_node_length(ext); + if (ext_sz > buf_sz) { + buf_sz = ext_sz; + } + } + } + data_copy_buf = malloc(buf_sz); + if (NULL == data_copy_buf) { + LOGERR("failed to allocate transfer copy buffer"); + ret = ENOMEM; + goto transfer_cleanup; + } + + /* read local data for all extents and write it to corresponding + * offsets within destination file. */ + size_t ext_ndx = 0; /* tracks extent array index */ + size_t chk_ndx = 0; /* tracks chunk array index */ + do { + size_t begin_chk_ndx = chk_ndx; + size_t copy_sz = 0; + for (size_t i = ext_ndx; i < n_extents; i++) { + ext = tta->local_extents + i; + size_t ext_sz = extent_tree_node_length(ext); + if ((copy_sz + ext_sz) <= buf_sz) { + chk = chunks + chk_ndx; + chk_ndx++; + ext_ndx++; + + chk->chunk_data = data_copy_buf + copy_sz; + copy_sz += ext_sz; + + rc = read_local_extent(ext, chk); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to copy extent[%zu] data for gfid=%d", + i, tta->gfid); + ret = rc; + goto transfer_cleanup; + } + } else { + /* no room left in copy buffer */ + break; + } + } + + /* write out data chunks for extents processed in this iteration */ + for (size_t i = begin_chk_ndx; i < chk_ndx; i++) { + chk = chunks + i; + rc = write_transfer_chunk(fd, chk); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("write_transfer_chunk(dst=%s, chk=%zu) failed", + tta->dst_file, i); + ret = rc; + goto transfer_cleanup; + } + } + } while (ext_ndx < n_extents); + +transfer_cleanup: + + close(fd); + tta->status = ret; + + if (NULL != tta->bcast_req) { + /* create a ULT to finish broadcast operation */ + collective_set_local_retval(tta->bcast_req->coll, ret); + invoke_bcast_progress_rpc(tta->bcast_req->coll); + } + + LOGDBG("signaling transfer completion"); + + rc = sm_complete_transfer_request(tta); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("sm_complete_transfer_request() failed"); + } + + /* release allocated memory */ + if (NULL != data_copy_buf) { + free(data_copy_buf); + } + if (NULL != chunks) { + free(chunks); + } + + return arg; +} diff --git a/server/src/unifyfs_transfer.h b/server/src/unifyfs_transfer.h new file mode 100644 index 000000000..6dfca0c80 --- /dev/null +++ b/server/src/unifyfs_transfer.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 
2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#ifndef UNIFYFS_TRANSFER_H +#define UNIFYFS_TRANSFER_H + +#include "unifyfs_global.h" + +/* server transfer modes */ +typedef enum { + TRANSFER_MODE_OWNER = 0, /* owner transfers all data */ + TRANSFER_MODE_LOCAL = 1 /* each server transfers local data */ +} transfer_mode_e; + +/* transfer helper thread arguments structure */ +typedef struct transfer_thread_args { + const char* dst_file; /* destination file */ + int gfid; /* source file */ + + /* requesting client and transfer info */ + int client_server; /* rank of server where request originated */ + int client_app; /* app of originating client */ + int client_id; /* id of originating client */ + int transfer_id; /* transfer request id at originating client */ + + /* local extents to transfer to destination file */ + struct extent_tree_node* local_extents; + size_t n_extents; + + size_t local_data_sz; /* total size of local data */ + + server_rpc_req_t* bcast_req; /* bcast rpc req state */ + + int status; /* status for entire set of transfers */ + pthread_t thrd; /* pthread id for transfer helper thread */ +} transfer_thread_args; + +void release_transfer_thread_args(transfer_thread_args* tta); + +/* find local extents for the given gfid and initialize transfer helper + * thread state */ +int create_local_transfers(int gfid, + const char* dest_file, + transfer_thread_args* tta); + +/** + * transfer helper thread main + * @param arg pointer to transfer_thread_args struct + * + * @return pointer to transfer_thread_args struct + */ +void* transfer_helper_thread(void* arg); + +#endif /* UNIFYFS_TRANSFER_H */ diff --git a/t/0700-unifyfs-stage-full.t b/t/0700-unifyfs-stage-full.t index 8db4f25b7..c25ad0e72 100755 --- a/t/0700-unifyfs-stage-full.t +++ b/t/0700-unifyfs-stage-full.t @@ -8,7 +8,7 @@ test_description="Test basic functionality of unifyfs-stage executable" . $(dirname $0)/sharness.sh test_expect_success "unifyfs-stage exists" ' - test_path_is_file ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage + test_path_is_file ${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage ' test_expect_success "testing temp dir exists" ' test_path_is_dir ${UNIFYFS_TEST_TMPDIR} @@ -58,7 +58,7 @@ test_expect_success "target directory is empty" ' stage_in_log=$stage_cfg_dir/stage_IN.log stage_out_log=$stage_cfg_dir/stage_OUT.log -stage_exe=${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage +stage_exe=${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage $JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} $stage_in_manifest &> $stage_in_log diff --git a/t/8000-client-api.t b/t/8000-library-api.t similarity index 77% rename from t/8000-client-api.t rename to t/8000-library-api.t index c7c56bf6e..c3688722f 100755 --- a/t/8000-client-api.t +++ b/t/8000-library-api.t @@ -5,4 +5,4 @@ # . $(dirname $0)/sharness.d/00-test-env.sh . 
$(dirname $0)/sharness.d/01-unifyfs-settings.sh -$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/api/client_api_test.t +$JOB_RUN_COMMAND $UNIFYFS_BUILD_DIR/t/api/api_test.t diff --git a/t/9020-mountpoint-empty.t b/t/9020-mountpoint-empty.t index 82141dd9d..3ce599364 100755 --- a/t/9020-mountpoint-empty.t +++ b/t/9020-mountpoint-empty.t @@ -9,7 +9,7 @@ # way and the call is falling through to the operating system. test_description="Verify UnifyFS intercepted mount point is empty" -. $(dirname $0)/sharness.sh +. $(dirname $0)/sharness.sh -v test_expect_success "Intercepted mount point $UNIFYFS_MOUNTPOINT is empty" ' test_dir_is_empty $UNIFYFS_MOUNTPOINT diff --git a/t/9300-unifyfs-stage-isolated.t b/t/9300-unifyfs-stage-isolated.t index 81cac7d0b..92544f741 100755 --- a/t/9300-unifyfs-stage-isolated.t +++ b/t/9300-unifyfs-stage-isolated.t @@ -8,7 +8,7 @@ test_description="Test basic functionality of unifyfs-stage executable" . $(dirname $0)/sharness.sh test_expect_success "unifyfs-stage exists" ' - test_path_is_file ${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage + test_path_is_file ${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage ' test_expect_success "testing temp dir exists" ' test_path_is_dir ${UNIFYFS_TEST_TMPDIR} @@ -36,7 +36,7 @@ test_expect_success "config_9300 directory is empty" ' # NOTE: we're using the unifyfs-stage binary as its own transfer data target # because we know it's there and it's filled with non-zero data. -stage_exe=${SHARNESS_BUILD_DIRECTORY}/util/unifyfs-stage/src/unifyfs-stage +stage_exe=${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage cp $stage_exe $stage_src_file test_expect_success "source.file exists" ' diff --git a/t/9999-cleanup.t b/t/9999-cleanup.t index a72e1dd24..d8eacb105 100755 --- a/t/9999-cleanup.t +++ b/t/9999-cleanup.t @@ -2,7 +2,7 @@ test_description="Cleanup test environment" -. $(dirname $0)/sharness.sh +. 
$(dirname $0)/sharness.sh -v test_expect_success "Cleanup" ' unifyfsd_cleanup diff --git a/t/Makefile.am b/t/Makefile.am index fdf35aa0d..57dd2ac9c 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -18,7 +18,7 @@ TESTS += \ 0510-statfs-static.t \ 0600-stdio-static.t \ 0700-unifyfs-stage-full.t \ - 8000-client-api.t \ + 8000-library-api.t \ 9005-unifyfs-unmount.t \ 9010-stop-unifyfsd.t \ 9020-mountpoint-empty.t \ @@ -41,7 +41,7 @@ clean-local: rm -fr trash-directory.* test-results *.log test_run_env.sh libexec_PROGRAMS = \ - api/client_api_test.t \ + api/api_test.t \ common/seg_tree_test.t \ common/slotmap_test.t \ std/stdio-static.t \ @@ -80,7 +80,6 @@ test_api_ldadd = \ test_api_ldflags = \ $(AM_LDFLAGS) \ - $(CP_WRAPPERS) \ -static # flags for common tests @@ -107,16 +106,17 @@ test_wrap_ldflags = \ # Per-target flags begin here -api_client_api_test_t_CPPFLAGS = $(test_cppflags) -api_client_api_test_t_LDADD = $(test_api_ldadd) -api_client_api_test_t_LDFLAGS = $(test_api_ldflags) -api_client_api_test_t_SOURCES = \ - api/client_api_suite.h \ - api/client_api_suite.c \ +api_api_test_t_CPPFLAGS = $(test_cppflags) +api_api_test_t_LDADD = $(test_api_ldadd) +api_api_test_t_LDFLAGS = $(test_api_ldflags) +api_api_test_t_SOURCES = \ + api/api_suite.h \ + api/api_suite.c \ api/init-fini.c \ api/create-open-remove.c \ api/write-read-sync-stat.c \ - api/laminate.c + api/laminate.c \ + api/transfer.c test_sysio_sources = \ sys/sysio_suite.h \ diff --git a/t/api/client_api_suite.c b/t/api/api_suite.c similarity index 87% rename from t/api/client_api_suite.c rename to t/api/api_suite.c index ab65a318e..70a170ff3 100644 --- a/t/api/client_api_suite.c +++ b/t/api/api_suite.c @@ -13,9 +13,9 @@ */ //#include -#include "client_api_suite.h" +#include "api_suite.h" -/* This is the collection of client API tests. +/* This is the collection of library API tests. * * To add new subtests to existing API functionality tests: * 1. Simply add the tests (order matters) to the appropriate @@ -25,7 +25,7 @@ * 1. Create a t/api/.c source file with a function called: * api__test(char *unifyfs_root) * to contain all the TAP tests for that API functionality. - * 2. Add the function name to t/api/client_api_suite.h, with comments. + * 2. Add the function name to t/api/api_suite.h, with comments. * 3. In t/Makefile.am, add the new file to the source file list for * the api test suite (api_client_api_test_t_SOURCES). * 4. The api__test function can now be used in this suite. */ @@ -34,6 +34,7 @@ int main(int argc, char* argv[]) { int rc; char* unifyfs_root = testutil_get_mount_point(); + char* tmp_dir = testutil_get_tmp_dir(); unifyfs_handle fshdl; @@ -61,6 +62,9 @@ int main(int argc, char* argv[]) api_laminate_test(unifyfs_root, &fshdl); + api_transfer_test(unifyfs_root, tmp_dir, &fshdl, + (size_t)64 * MIB, (size_t)4 * MIB); + api_finalize_test(unifyfs_root, &fshdl); } diff --git a/t/api/client_api_suite.h b/t/api/api_suite.h similarity index 79% rename from t/api/client_api_suite.h rename to t/api/api_suite.h index b02f04ac7..731c24d9f 100644 --- a/t/api/client_api_suite.h +++ b/t/api/api_suite.h @@ -12,7 +12,7 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -/* This is the collection of client API tests. +/* This is the collection of library API tests. * * When new API functionality needs to be tested: * 1. Create a t/api/.c file with a function called: @@ -22,10 +22,10 @@ * 3. 
In t/Makefile.am, add the new file to the source file list for * the api test suite (api_client_api_test_t_SOURCES). * 4. The api__test function can now be called from the suite's - * implementation in t/api/client_api_suite.c */ + * implementation in t/api/api_suite.c */ -#ifndef T_CLIENT_API_SUITE_H -#define T_CLIENT_API_SUITE_H +#ifndef T_LIBRARY_API_SUITE_H +#define T_LIBRARY_API_SUITE_H #include "t/lib/tap.h" #include "t/lib/testutil.h" @@ -53,4 +53,11 @@ int api_write_read_sync_stat_test(char* unifyfs_root, int api_laminate_test(char* unifyfs_root, unifyfs_handle* fshdl); -#endif /* T_CLIENT_API_SUITE_H */ +/* Tests file transfers, both serial and parallel */ +int api_transfer_test(char* unifyfs_root, + char* tmpdir, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize); + +#endif /* T_LIBRARY_API_SUITE_H */ diff --git a/t/api/create-open-remove.c b/t/api/create-open-remove.c index 8d201a48f..4b2f69544 100644 --- a/t/api/create-open-remove.c +++ b/t/api/create-open-remove.c @@ -12,7 +12,8 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -#include "client_api_suite.h" +#include "api_suite.h" +#include int api_create_open_remove_test(char* unifyfs_root, unifyfs_handle* fshdl) @@ -48,15 +49,17 @@ int api_create_open_remove_test(char* unifyfs_root, diag("Starting API open tests"); + int rdwr_flags = O_RDWR; + int rd_flags = O_RDONLY; unifyfs_gfid t3_gfid = UNIFYFS_INVALID_GFID; unifyfs_gfid t4_gfid = UNIFYFS_INVALID_GFID; - rc = unifyfs_open(*fshdl, testfile1, &t3_gfid); + rc = unifyfs_open(*fshdl, rdwr_flags, testfile1, &t3_gfid); ok(rc == UNIFYFS_SUCCESS && t3_gfid == t1_gfid, "%s:%d unifyfs_open(%s) is successful: rc=%d (%s)", __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); - rc = unifyfs_open(*fshdl, testfile2, &t4_gfid); + rc = unifyfs_open(*fshdl, rd_flags, testfile2, &t4_gfid); ok(rc == UNIFYFS_SUCCESS && t4_gfid == t2_gfid, "%s:%d unifyfs_open(%s) is successful: rc=%d (%s)", __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); @@ -73,7 +76,7 @@ int api_create_open_remove_test(char* unifyfs_root, __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); if (UNIFYFS_SUCCESS == rc) { unifyfs_gfid t5_gfid = UNIFYFS_INVALID_GFID; - rc = unifyfs_open(*fshdl, testfile1, &t5_gfid); + rc = unifyfs_open(*fshdl, rd_flags, testfile1, &t5_gfid); ok(rc != UNIFYFS_SUCCESS && t5_gfid == UNIFYFS_INVALID_GFID, "%s:%d unifyfs_open(%s) after unifyfs_remove() fails: rc=%d (%s)", __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); @@ -85,7 +88,7 @@ int api_create_open_remove_test(char* unifyfs_root, __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); if (UNIFYFS_SUCCESS == rc) { unifyfs_gfid t6_gfid = UNIFYFS_INVALID_GFID; - rc = unifyfs_open(*fshdl, testfile1, &t6_gfid); + rc = unifyfs_open(*fshdl, rd_flags, testfile1, &t6_gfid); ok(rc != UNIFYFS_SUCCESS && t6_gfid == UNIFYFS_INVALID_GFID, "%s:%d unifyfs_open(%s) after unifyfs_remove() fails: rc=%d (%s)", __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); diff --git a/t/api/init-fini.c b/t/api/init-fini.c index ebb6e375a..ea4602559 100644 --- a/t/api/init-fini.c +++ b/t/api/init-fini.c @@ -12,7 +12,7 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ -#include "client_api_suite.h" +#include "api_suite.h" int api_initialize_test(char* unifyfs_root, unifyfs_handle* fshdl) diff --git a/t/api/laminate.c b/t/api/laminate.c index 3b932f9ed..931065042 100644 --- a/t/api/laminate.c +++ b/t/api/laminate.c @@ -12,7 +12,7 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -#include "client_api_suite.h" +#include "api_suite.h" #include /* Tests file laminate, with subsequent write/read/stat */ diff --git a/t/api/transfer.c b/t/api/transfer.c new file mode 100644 index 000000000..70fbf3872 --- /dev/null +++ b/t/api/transfer.c @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include + +#include "api_suite.h" + +int api_transfer_test(char* unifyfs_root, + char* tmpdir, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize) +{ + /* Create a random file names at the mountpoint path to test */ + char testfile1[64]; + char testfile2[64]; + char testfile3[64]; + testutil_rand_path(testfile1, sizeof(testfile1), unifyfs_root); + testutil_rand_path(testfile2, sizeof(testfile2), tmpdir); + testutil_rand_path(testfile3, sizeof(testfile3), tmpdir); + + //------------- + + int rc, err; + + size_t n_chks = filesize / chksize; + size_t extra = filesize % chksize; + if (extra) { + /* test only supports exact multiples of chunk size */ + filesize -= extra; + } + + char* databuf = malloc(filesize); + char* readbuf = malloc(filesize); + rc = (databuf == NULL) || (readbuf == NULL); + ok(rc == 0, + "%s:%d malloc() of two buffers with size=%zu is successful", + __FILE__, __LINE__, filesize); + if (rc) { + diag("Initial setup failed"); + return 1; + } + + testutil_lipsum_generate(databuf, filesize, 0); + + diag("Starting API transfer tests"); + + /** + * Overview of test workflow: + * (1) create new source file for transfer (testfile1) + * (2) write and sync source file + * (3) stat source file to verify size + * (4) parallel copy transfer of source file to testfile2 + * (5) parallel move transfer of source file to testfile3 + * (6) verify source file has been removed due to (5) + * (7) read and check full contents of both destination files + */ + + /* (1) create new source file for transfer (testfile1) */ + + int t1_flags = 0; + unifyfs_gfid t1_gfid = UNIFYFS_INVALID_GFID; + rc = unifyfs_create(*fshdl, t1_flags, testfile1, &t1_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t1_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: gfid=%u rc=%d (%s)", + __FILE__, __LINE__, testfile1, (unsigned int)t1_gfid, + rc, unifyfs_rc_enum_description(rc)); + + /* (2) write and sync source file */ + + unifyfs_io_request t1_writes[n_chks + 1]; + for (size_t i = 0; i < n_chks; i++) { + t1_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t1_writes[i].gfid = t1_gfid; + t1_writes[i].nbytes = chksize; + t1_writes[i].offset = (off_t)(i * chksize); + t1_writes[i].user_buf = databuf + (i * chksize); + } + t1_writes[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + t1_writes[n_chks].gfid = t1_gfid; + + rc = unifyfs_dispatch_io(*fshdl, n_chks + 1, t1_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is 
successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks + 1, t1_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + /* (3) stat source file to verify size */ + + unifyfs_status t1_status = {0}; + rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); + /* expected size=filesize since writes have been synced */ + ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), + "%s:%d unifyfs_stat(gfid=%u) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, (unsigned int)t1_gfid, + t1_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + + /* (4) parallel copy transfer of source file to testfile2 */ + + unifyfs_transfer_request t2_transfer = {0}; + t2_transfer.src_path = testfile1; + t2_transfer.dst_path = testfile2; + t2_transfer.mode = UNIFYFS_TRANSFER_MODE_COPY; + t2_transfer.use_parallel = 1; + rc = unifyfs_dispatch_transfer(*fshdl, 1, &t2_transfer); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_transfer(%s -> %s, COPY, PARALLEL) succeeds:" + " rc=%d (%s)", __FILE__, __LINE__, testfile1, testfile2, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_transfer(*fshdl, 1, &t2_transfer, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_transfer(%s -> %s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, testfile2, + rc, unifyfs_rc_enum_description(rc)); + + /* (5) parallel move transfer of source file to testfile3 */ + + unifyfs_transfer_request t3_transfer = {0}; + t3_transfer.src_path = testfile1; + t3_transfer.dst_path = testfile3; + t3_transfer.mode = UNIFYFS_TRANSFER_MODE_MOVE; + t3_transfer.use_parallel = 1; + rc = unifyfs_dispatch_transfer(*fshdl, 1, &t3_transfer); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_transfer(%s -> %s, MOVE, PARALLEL) succeeds:" + " rc=%d (%s)", __FILE__, __LINE__, testfile1, testfile3, + rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_transfer(*fshdl, 1, &t3_transfer, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_transfer(%s -> %s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, testfile3, + rc, unifyfs_rc_enum_description(rc)); + + /* (6) verify source file has been removed due to move transfer in (5) */ + + rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); + /* expect EINVAL as testfile1/t1_gfid should no longer exist */ + ok(rc == EINVAL, + "%s:%d unifyfs_stat(gfid=%u) fails with EINVAL: rc=%d (%s)", + __FILE__, __LINE__, (unsigned int)t1_gfid, + rc, unifyfs_rc_enum_description(rc)); + if (rc != EINVAL) { + /* move transfer failed to remove source, try explicit remove */ + rc = unifyfs_remove(*fshdl, testfile1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + } + + /* (7) read and check full contents of both destination files */ + + errno = 0; + rc = open(testfile2, O_RDONLY); + err = errno; + ok(rc != -1 && err == 0, + "%s:%d open(%s, RDONLY) is successful: fd=%d (%s)", + __FILE__, __LINE__, testfile2, rc, strerror(err)); + if (rc != -1) { + int t2_fd = rc; + memset(readbuf, (int)'?', filesize); + struct aiocb t2_reads[n_chks]; + struct aiocb* t2_list[n_chks]; + memset(t2_reads, 0, sizeof(t2_reads)); + for (size_t i = 0; i < n_chks; i++) { + t2_list[i] = t2_reads + i; + t2_reads[i].aio_lio_opcode = LIO_READ; + 
t2_reads[i].aio_fildes = t2_fd; + t2_reads[i].aio_nbytes = chksize; + t2_reads[i].aio_offset = (off_t)(i * chksize); + t2_reads[i].aio_buf = readbuf + (i * chksize); + } + + errno = 0; + rc = lio_listio(LIO_WAIT, t2_list, (int)n_chks, NULL); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lio_listio(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, strerror(err)); + + for (size_t i = 0; i < n_chks; i++) { + const char* buf = (const char*) t2_reads[i].aio_buf; + size_t bytes = t2_reads[i].aio_nbytes; + off_t off = t2_reads[i].aio_offset; + + /* check read operation status */ + err = aio_error(t2_list[i]); + size_t cnt = aio_return(t2_list[i]); + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zd," + " rc=%d (%s)", __FILE__, __LINE__, testfile2, (size_t)off, + bytes, (ssize_t)cnt, err, strerror(err)); + + /* check valid data */ + uint64_t error_offset; + int check = testutil_lipsum_check(buf, (uint64_t)bytes, + (uint64_t)off, &error_offset); + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile2, (size_t)off, bytes); + } + + } + + errno = 0; + rc = open(testfile3, O_RDONLY); + err = errno; + ok(rc != -1 && err == 0, + "%s:%d open(%s, RDONLY) is successful: fd=%d (%s)", + __FILE__, __LINE__, testfile3, rc, strerror(err)); + if (rc != -1) { + int t3_fd = rc; + memset(readbuf, (int)'?', filesize); + struct aiocb t3_reads[n_chks]; + struct aiocb* t3_list[n_chks]; + memset(t3_reads, 0, sizeof(t3_reads)); + for (size_t i = 0; i < n_chks; i++) { + t3_list[i] = t3_reads + i; + t3_reads[i].aio_lio_opcode = LIO_READ; + t3_reads[i].aio_fildes = t3_fd; + t3_reads[i].aio_nbytes = chksize; + t3_reads[i].aio_offset = (off_t)(i * chksize); + t3_reads[i].aio_buf = readbuf + (i * chksize); + } + + errno = 0; + rc = lio_listio(LIO_WAIT, t3_list, (int)n_chks, NULL); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d lio_listio(%s, OP_READ) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, rc, strerror(err)); + + for (size_t i = 0; i < n_chks; i++) { + const char* buf = (const char*) t3_reads[i].aio_buf; + size_t bytes = t3_reads[i].aio_nbytes; + off_t off = t3_reads[i].aio_offset; + + /* check read operation status */ + err = aio_error(t3_list[i]); + size_t cnt = aio_return(t3_list[i]); + ok((err == 0) && (cnt == bytes), + "%s:%d read(%s, offset=%zu, sz=%zu) is successful: count=%zd," + " rc=%d (%s)", __FILE__, __LINE__, testfile3, (size_t)off, + bytes, (ssize_t)cnt, err, strerror(err)); + + /* check valid data */ + uint64_t error_offset; + int check = testutil_lipsum_check(buf, (uint64_t)bytes, + (uint64_t)off, &error_offset); + ok(check == 0, + "%s:%d read(%s, offset=%zu, sz=%zu) data check is successful", + __FILE__, __LINE__, testfile3, (size_t)off, bytes); + } + } + + diag("Finished API transfer tests"); + + //------------- + + diag("Removing test files"); + + errno = 0; + rc = remove(testfile2); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, err, strerror(err)); + + errno = 0; + rc = remove(testfile3); + err = errno; + ok(rc == 0 && err == 0, + "%s:%d remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile3, err, strerror(err)); + + //------------- + + return 0; +} diff --git a/t/api/write-read-sync-stat.c b/t/api/write-read-sync-stat.c index fe6f42549..3e72fa993 100644 --- a/t/api/write-read-sync-stat.c +++ b/t/api/write-read-sync-stat.c @@ -14,7 +14,7 @@ #include -#include 
"client_api_suite.h" +#include "api_suite.h" int api_write_read_sync_stat_test(char* unifyfs_root, unifyfs_handle* fshdl, diff --git a/t/lib/testutil.c b/t/lib/testutil.c index 0d4e3b4c6..17c3dec9d 100644 --- a/t/lib/testutil.c +++ b/t/lib/testutil.c @@ -83,26 +83,49 @@ void testutil_rand_path(char* buf, size_t len, const char* pfx) testutil_rand_string(buf + rc, len - rc); } +/* + * Return a pointer to the path name of the test temp directory. Use the + * value of the environment variable UNIFYFS_TEST_TMPDIR if it exists, + * otherwise use P_tmpdir (defined in stdio.h, typically '/tmp'). + */ +char* testutil_get_tmp_dir(void) +{ + char* path; + char* val = getenv("UNIFYFS_TEST_TMPDIR"); + + if (val != NULL) { + path = val; + } else { + path = P_tmpdir; + } + + return path; +} + /* * Return a pointer to the path name of the UnifyFS mount point. Use the * value of the environment variable UNIFYFS_MOUNTPOINT if it exists, - * otherwise use P_tmpdir which is defined in stdio.h and is typically - * /tmp. + * otherwise use 'tmpdir/unifyfs'. */ char* testutil_get_mount_point(void) { char* path; - char* env = getenv("UNIFYFS_MOUNTPOINT"); + char* val = getenv("UNIFYFS_MOUNTPOINT"); - if (env != NULL) { - path = env; + if (val != NULL) { + path = val; } else { - path = P_tmpdir; + char* tmpdir = testutil_get_tmp_dir(); + size_t path_len = strlen(tmpdir) + strlen("/unifyfs") + 1; + path = malloc(path_len); + snprintf(path, path_len, "%s/unifyfs", tmpdir); } return path; } + + /* Stat the file associated to by path and store the global size of the * file at path in the address of the global pointer passed in. */ void testutil_get_size(char* path, size_t* global) diff --git a/t/lib/testutil.h b/t/lib/testutil.h index b138cba45..d35f79bc9 100644 --- a/t/lib/testutil.h +++ b/t/lib/testutil.h @@ -26,11 +26,17 @@ void testutil_rand_string(char* buf, size_t len); */ void testutil_rand_path(char* buf, size_t len, const char* pfx); +/* + * Return a pointer to the path name of the test temp directory. Use the + * value of the environment variable UNIFYFS_TEST_TMPDIR if it exists, + * otherwise use P_tmpdir (defined in stdio.h, typically '/tmp'). + */ +char* testutil_get_tmp_dir(void); + /* * Return a pointer to the path name of the UnifyFS mount point. Use the - * value of the environment variable UNIFYFS_MOUNT_POINT if it exists, - * otherwise use P_tmpdir which is defined in stdio.h and is typically - * /tmp. + * value of the environment variable UNIFYFS_MOUNTPOINT if it exists, + * otherwise use 'tmpdir/unifyfs'. */ char* testutil_get_mount_point(void); diff --git a/t/sharness.d/02-functions.sh b/t/sharness.d/02-functions.sh index e765e8a51..68fe08a32 100644 --- a/t/sharness.d/02-functions.sh +++ b/t/sharness.d/02-functions.sh @@ -107,8 +107,19 @@ unifyfsd_dump_state() fi done fi - # print out dumpfile contents to current test log - cat $dumpfile >&3 + + server_log=$UNIFYFS_TEST_TMPDIR/unifyfsd.stdlog + if [ -f $server_log ]; then + echo "Dumping server stdout/err file contents :" >> $dumpfile + echo "========= $server_log ==========" >> $dumpfile + cat $server_log >> $dumpfile + echo "+++++++++++++++++++++++" >> $dumpfile + echo >> $dumpfile + fi + + # copy dumpfile to /tmp for later access + cp $dumpfile /tmp + return 0 } diff --git a/t/sharness.sh b/t/sharness.sh index 787c174b2..a23888982 100644 --- a/t/sharness.sh +++ b/t/sharness.sh @@ -658,7 +658,7 @@ test_path_is_dir () { # Check if the directory exists and is empty as expected, barf otherwise. 
test_dir_is_empty () { test_path_is_dir "$1" && - if test -n "$(ls -a1 "$1" | egrep -v '^\.\.?$')" + if test -n "$(ls -a1 "$1" | egrep -v '^\.\.?/?$')" then echo "Directory '$1' is not empty, it contains:" ls -la "$1" diff --git a/t/std/stdio_suite.c b/t/std/stdio_suite.c index 7fd8b980b..a87d57d2c 100644 --- a/t/std/stdio_suite.c +++ b/t/std/stdio_suite.c @@ -53,7 +53,7 @@ int main(int argc, char* argv[]) unifyfs_root = testutil_get_mount_point(); /* Verify unifyfs_mount succeeds. */ - rc = unifyfs_mount(unifyfs_root, rank, rank_num, 0); + rc = unifyfs_mount(unifyfs_root, rank, rank_num); ok(rc == 0, "unifyfs_mount(%s) (rc=%d)", unifyfs_root, rc); if (rc != 0) { diff --git a/t/sys/statfs_suite.c b/t/sys/statfs_suite.c index 8dd0e0e02..971c316b8 100644 --- a/t/sys/statfs_suite.c +++ b/t/sys/statfs_suite.c @@ -40,7 +40,7 @@ int main(int argc, char* argv[]) unifyfs_root = testutil_get_mount_point(); /* Verify unifyfs_mount succeeds. */ - rc = unifyfs_mount(unifyfs_root, rank, rank_num, 0); + rc = unifyfs_mount(unifyfs_root, rank, rank_num); ok(rc == 0, "unifyfs_mount(%s) (rc=%d)", unifyfs_root, rc); /* If the mount fails, bailout, as there is no point in running the tests */ diff --git a/t/sys/sysio_suite.c b/t/sys/sysio_suite.c index 99a172295..89c8f6cf3 100644 --- a/t/sys/sysio_suite.c +++ b/t/sys/sysio_suite.c @@ -53,7 +53,7 @@ int main(int argc, char* argv[]) unifyfs_root = testutil_get_mount_point(); /* Verify unifyfs_mount succeeds. */ - rc = unifyfs_mount(unifyfs_root, rank, rank_num, 0); + rc = unifyfs_mount(unifyfs_root, rank, rank_num); ok(rc == 0, "unifyfs_mount(%s) (rc=%d)", unifyfs_root, rc); /* If the mount fails, bailout, as there is no point in running the tests */ diff --git a/t/unifyfs_unmount.c b/t/unifyfs_unmount.c index a90d363df..d01e1fd52 100644 --- a/t/unifyfs_unmount.c +++ b/t/unifyfs_unmount.c @@ -22,7 +22,7 @@ int main(int argc, char* argv[]) /* * Verify unifyfs_mount succeeds. 
     */
-    rc = unifyfs_mount(unifyfs_root, rank, rank_num, 0);
+    rc = unifyfs_mount(unifyfs_root, rank, rank_num);
     ok(rc == 0, "unifyfs_mount at %s (rc=%d)", unifyfs_root, rc);
 
     rc = unifyfs_unmount();
diff --git a/util/unifyfs-stage/src/unifyfs-stage.c b/util/unifyfs-stage/src/unifyfs-stage.c
index 9eb5dd6dc..947d58e3f 100644
--- a/util/unifyfs-stage/src/unifyfs-stage.c
+++ b/util/unifyfs-stage/src/unifyfs-stage.c
@@ -283,7 +283,7 @@ int main(int argc, char** argv)
     }
 
     if (should_we_mount_unifyfs && !ctx->enable_mpi_mount) {
-        ret = unifyfs_mount(mountpoint, rank, total_ranks, 0);
+        ret = unifyfs_mount(mountpoint, rank, total_ranks);
         if (ret) {
             fprintf(stderr, "failed to mount unifyfs at %s (%s)",
                     ctx->mountpoint, strerror(ret));
diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c
index 0e6368fa1..37319347b 100644
--- a/util/unifyfs/src/unifyfs-rm.c
+++ b/util/unifyfs/src/unifyfs-rm.c
@@ -212,7 +212,7 @@ static int wait_server_initialization(unifyfs_resource_t* resource,
 
     return_val_from_scnprintf =
         scnprintf(filename, PATH_MAX,
-                  "%s/%s", args->share_dir, UNIFYFSD_PID_FILENAME);
+                  "%s/%s", args->share_dir, UNIFYFS_SERVER_PID_FILENAME);
     if (return_val_from_scnprintf > (PATH_MAX - 2)) {
         fprintf(stderr, "Unifyfs status filename is too long!\n");
         return -ENOMEM;
@@ -357,7 +357,7 @@ static int remove_server_pid_file(unifyfs_args_t* args)
 
     return_val_from_scnprintf =
         scnprintf(filename, PATH_MAX,
-                  "%s/%s", args->share_dir, UNIFYFSD_PID_FILENAME);
+                  "%s/%s", args->share_dir, UNIFYFS_SERVER_PID_FILENAME);
     if (return_val_from_scnprintf > (PATH_MAX - 2)) {
         fprintf(stderr, "Unifyfs status filename is too long!\n");
         return -ENOMEM;

From 4eb6324ff4fbfca632c193cab795b918f0ce0ecf Mon Sep 17 00:00:00 2001
From: Ross Miller
Date: Tue, 27 Jul 2021 10:01:20 -0400
Subject: [PATCH 30/81] Fix bug in unifyfs_fskv_init()

If unifyfsd was started without specifying the shared dir,
unifyfs_fskv_init() would not actually initialize the K/V store, but it
would return success. This commit adds code to check for that condition
and return an error (and write an appropriate message to the error log).

Also, a couple of tweaks to match the UnifyFS style guide.
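
For reference, the shape of the guard this change introduces is roughly
the following (a minimal sketch reusing the names that appear in the
diff below, not the verbatim hunk):

    if (UNIFYFS_SERVER == cfg->ptype) {
        if (NULL == cfg->sharedfs_dir) {
            /* server process, but nobody specified the sharedfs dir */
            LOGERR("can't create kvstore - sharedfs not specified");
            return (int)UNIFYFS_ERROR_KEYVAL;
        }
        /* ... find or create the shared kvstore directory and the
         *     rank-specific subdirectory as before ... */
        have_sharedfs_kvstore = 1;
    }

With this guard, a misconfigured server fails fast during K/V store
initialization instead of reporting success and failing later.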
--- common/src/unifyfs_keyval.c | 66 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index ddd3de4da..4975a4794 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -516,7 +516,7 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) err = errno; if ((rc != 0) && (err != EEXIST)) { LOGERR("failed to create local kvstore directory %s - %s", - localfs_kvdir, strerror(err)); + localfs_kvdir, strerror(err)); return (int)UNIFYFS_ERROR_KEYVAL; } } else { @@ -525,39 +525,45 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) } } - if ((UNIFYFS_SERVER == cfg->ptype) && (NULL != cfg->sharedfs_dir)) { - // find or create shared kvstore directory - snprintf(sharedfs_kvdir, sizeof(sharedfs_kvdir), "%s/kvstore", - cfg->sharedfs_dir); - memset(&s, 0, sizeof(struct stat)); - rc = stat(sharedfs_kvdir, &s); - if (rc != 0) { - // try to create it - rc = mkdir(sharedfs_kvdir, 0770); - err = errno; - if ((rc != 0) && (err != EEXIST)) { - LOGERR("failed to create kvstore directory %s - %s", - sharedfs_kvdir, strerror(err)); - return (int)UNIFYFS_ERROR_KEYVAL; + if (UNIFYFS_SERVER == cfg->ptype) { + if (NULL != cfg->sharedfs_dir) { + // find or create shared kvstore directory + snprintf(sharedfs_kvdir, sizeof(sharedfs_kvdir), "%s/kvstore", + cfg->sharedfs_dir); + memset(&s, 0, sizeof(struct stat)); + rc = stat(sharedfs_kvdir, &s); + if (rc != 0) { + // try to create it + rc = mkdir(sharedfs_kvdir, 0770); + err = errno; + if ((rc != 0) && (err != EEXIST)) { + LOGERR("failed to create kvstore directory %s - %s", + sharedfs_kvdir, strerror(err)); + return (int)UNIFYFS_ERROR_KEYVAL; + } } - } - // find or create rank-specific subdir - scnprintf(sharedfs_rank_kvdir, sizeof(sharedfs_rank_kvdir), "%s/%d", - sharedfs_kvdir, kv_myrank); - memset(&s, 0, sizeof(struct stat)); - rc = stat(sharedfs_rank_kvdir, &s); - if (rc != 0) { - // try to create it - rc = mkdir(sharedfs_rank_kvdir, 0770); - err = errno; - if ((rc != 0) && (err != EEXIST)) { - LOGERR("failed to create rank kvstore directory %s - %s", - sharedfs_rank_kvdir, strerror(err)); - return (int)UNIFYFS_ERROR_KEYVAL; + // find or create rank-specific subdir + scnprintf(sharedfs_rank_kvdir, sizeof(sharedfs_rank_kvdir), "%s/%d", + sharedfs_kvdir, kv_myrank); + memset(&s, 0, sizeof(struct stat)); + rc = stat(sharedfs_rank_kvdir, &s); + if (rc != 0) { + // try to create it + rc = mkdir(sharedfs_rank_kvdir, 0770); + err = errno; + if ((rc != 0) && (err != EEXIST)) { + LOGERR("failed to create rank kvstore directory %s - %s", + sharedfs_rank_kvdir, strerror(err)); + return (int)UNIFYFS_ERROR_KEYVAL; + } } + have_sharedfs_kvstore = 1; + } else { + // Server process, but nobody specified the sharedfs dir + LOGERR("can't create kvstore - sharedfs not specified"); + return (int)UNIFYFS_ERROR_KEYVAL; } - have_sharedfs_kvstore = 1; } kv_max_keylen = UNIFYFS_MAX_KV_KEYLEN; From 5db75c9a6d894cd393424072941057aa0a50bc4d Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Mon, 26 Jul 2021 16:57:59 -0400 Subject: [PATCH 31/81] ensure logio data is synced on user fsync() TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- client/src/unifyfs-sysio.c | 17 ++++++++++++----- client/src/unifyfs_api.c | 16 ++++++++++++++-- client/src/unifyfs_api_internal.h | 1 + client/src/unifyfs_fid.c | 28 ++++++++++++++++++++++++++++ client/src/unifyfs_fid.h | 8 +++++++- common/src/unifyfs_configurator.h | 1 + docs/configuration.rst | 3 ++- 7 files changed, 65 insertions(+), 9 deletions(-) diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 72d3fda35..99a69159b 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -1933,13 +1933,20 @@ int UNIFYFS_WRAP(fsync)(int fd) } /* invoke fsync rpc to register index metadata with server */ - int ret = unifyfs_fid_sync_extents(posix_client, fid); - if (ret != UNIFYFS_SUCCESS) { - /* sync failed for some reason, set errno and return error */ - errno = unifyfs_rc_errno(ret); + int rc = unifyfs_fid_sync_extents(posix_client, fid); + if (rc != UNIFYFS_SUCCESS) { + /* metadata sync failed, set errno and return error */ + errno = unifyfs_rc_errno(rc); return -1; + } else if (posix_client->use_fsync_persist) { + /* now sync file data to storage */ + rc = unifyfs_fid_sync_data(posix_client, fid); + if (rc != UNIFYFS_SUCCESS) { + /* data sync failed, set errno and return error */ + errno = unifyfs_rc_errno(rc); + return -1; + } } - return 0; } else { MAP_OR_FAIL(fsync); diff --git a/client/src/unifyfs_api.c b/client/src/unifyfs_api.c index 97d87e171..bf234ae69 100644 --- a/client/src/unifyfs_api.c +++ b/client/src/unifyfs_api.c @@ -123,9 +123,21 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, } } + /* Determine whether we persist data to storage device on fsync(). + * Turning this setting off speeds up fsync() by only syncing the + * extent metadata, but it violates POSIX semanatics. */ + client->use_fsync_persist = true; + cfgval = client_cfg->client_fsync_persist; + if (cfgval != NULL) { + rc = configurator_bool_val(cfgval, &b); + if (rc == 0) { + client->use_fsync_persist = (bool)b; + } + } + /* Determine whether we automatically sync every write to server. - * This slows write performance, but it can serve as a work - * around for apps that do not have all necessary syncs. */ + * Turning this setting on slows write performance, but it can serve + * as a workaround for apps that do not have all the necessary syncs. 
*/ client->use_write_sync = false; cfgval = client_cfg->client_write_sync; if (cfgval != NULL) { diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h index 6eecff3b4..086745a34 100644 --- a/client/src/unifyfs_api_internal.h +++ b/client/src/unifyfs_api_internal.h @@ -60,6 +60,7 @@ typedef struct unifyfs_client { /* mountpoint configuration */ unifyfs_cfg_t cfg; /* user-provided configuration */ + bool use_fsync_persist; /* persist data to storage on fsync() */ bool use_local_extents; /* enable tracking of local extents */ bool use_write_sync; /* sync for every write operation */ bool use_unifyfs_magic; /* return UNIFYFS (true) or TMPFS (false) diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index d1710bdfb..a990a5995 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -984,6 +984,34 @@ static off_t rewrite_index_from_seg_tree(unifyfs_client* client, return max_log_offset; } +/* Sync extent data for file to storage */ +int unifyfs_fid_sync_data(unifyfs_client* client, + int fid) +{ + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); + if ((NULL == meta) || (meta->fid != fid)) { + /* bail out with an error if we fail to find it */ + LOGERR("missing filemeta for fid=%d", fid); + return UNIFYFS_FAILURE; + } + + /* sync file data to storage. + * NOTE: this syncs all client data, not just the target file's */ + int rc = unifyfs_logio_sync(client->state.logio_ctx); + if (UNIFYFS_SUCCESS != rc) { + /* something went wrong when trying to flush extents */ + LOGERR("failed to flush data to storage for client[%d:%d]", + client->state.app_id, client->state.client_id); + ret = rc; + } + + return ret; +} + + /* Sync data for file to server if needed */ int unifyfs_fid_sync_extents(unifyfs_client* client, int fid) diff --git a/client/src/unifyfs_fid.h b/client/src/unifyfs_fid.h index 66c9e3af0..302a4212d 100644 --- a/client/src/unifyfs_fid.h +++ b/client/src/unifyfs_fid.h @@ -145,8 +145,14 @@ int unifyfs_fid_truncate(unifyfs_client* client, int fid, off_t length); -/* Sync extent data for file to server if needed */ +/* Sync extent data for file to storage */ +int unifyfs_fid_sync_data(unifyfs_client* client, + int fid); + +/* Sync extent metadata for file to server if needed */ int unifyfs_fid_sync_extents(unifyfs_client* client, int fid); + + #endif /* UNIFYFS_FID_H */ diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 2c12eda64..aa7965091 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -70,6 +70,7 @@ UNIFYFS_CFG_CLI(unifyfs, daemonize, BOOL, on, "enable server daemonization", NULL, 'D', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ + UNIFYFS_CFG(client, fsync_persist, BOOL, on, "persist written data to storage on fsync()", NULL) \ UNIFYFS_CFG(client, local_extents, BOOL, off, "track extents to service reads of local data", NULL) \ UNIFYFS_CFG(client, max_files, INT, UNIFYFS_CLIENT_MAX_FILES, "client max file count", NULL) \ UNIFYFS_CFG(client, write_index_size, INT, UNIFYFS_CLIENT_WRITE_INDEX_SIZE, "write metadata index buffer size", NULL) \ diff --git a/docs/configuration.rst b/docs/configuration.rst index 5383729ed..675bf6291 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -67,8 
+67,9 @@ a given section and key. Key Type Description ================ ====== ================================================================= cwd STRING effective starting current working directory - max_files INT maximum number of open files per client process (default: 128) + fsync_persist BOOL persist data to storage on fsync() (default: on) local_extents BOOL service reads from local data if possible (default: off) + max_files INT maximum number of open files per client process (default: 128) super_magic BOOL whether to return UNIFYFS (on) or TMPFS (off) statfs magic (default: on) write_index_size INT maximum size (B) of memory buffer for storing write log metadata write_sync BOOL sync data to server after every write (default: off) From 566dd04e00598f573da9d4b732646e7448d35878 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Wed, 28 Jul 2021 09:24:44 -0400 Subject: [PATCH 32/81] release logio storage for extents on file unlink Each server is responsible for releasing the local extents for the target file. For extents that have not yet been synced to the local server, the client must release the storage. Also includes a bug fix for extents that happened to span the shmem and spill portions of the log. The bug was revealed by the new library API test that makes sure we can properly reclaim storage for deleted files (t/api/storage-reuse.c). --- client/src/unifyfs_fid.c | 18 +++ common/src/slotmap.c | 55 +++++++-- common/src/slotmap.h | 2 + common/src/unifyfs_logio.c | 51 +++++--- server/src/unifyfs_inode.c | 82 +++++++++---- t/Makefile.am | 15 ++- t/api/api_suite.c | 11 +- t/api/api_suite.h | 8 ++ t/api/storage-reuse.c | 232 +++++++++++++++++++++++++++++++++++++ 9 files changed, 419 insertions(+), 55 deletions(-) create mode 100644 t/api/storage-reuse.c diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index a990a5995..8822f4ca1 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -92,6 +92,24 @@ static int fid_storage_free(unifyfs_client* client, unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); if ((meta != NULL) && (meta->fid == fid)) { if (meta->storage == FILE_STORAGE_LOGIO) { + /* client needs to release unsynced write extents, since server + * does not know about them */ + seg_tree_rdlock(&meta->extents_sync); + struct seg_tree_node* node = NULL; + while ((node = seg_tree_iter(&meta->extents_sync, node))) { + size_t nbytes = (size_t) (node->end - node->start + 1); + off_t log_offset = (off_t) node->ptr; + int rc = unifyfs_logio_free(client->state.logio_ctx, + log_offset, nbytes); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("failed to free logio allocation for " + "client[%d:%d] log_offset=%zu nbytes=%zu", + client->state.app_id, client->state.client_id, + log_offset, nbytes); + } + } + seg_tree_unlock(&meta->extents_sync); + /* Free our write seg_tree */ seg_tree_destroy(&meta->extents_sync); diff --git a/common/src/slotmap.c b/common/src/slotmap.c index 408a36b3b..a39d881c5 100644 --- a/common/src/slotmap.c +++ b/common/src/slotmap.c @@ -157,6 +157,8 @@ int slotmap_clear(slot_map* smap) /* set used to zero */ smap->used_slots = 0; + smap->first_used_slot = -1; + smap->last_used_slot = -1; /* zero-out use map */ uint8_t* usemap = get_use_map(smap); @@ -228,9 +230,9 @@ ssize_t slotmap_reserve(slot_map* smap, /* search for contiguous free slots */ size_t search_start = 0; - if (slot_bytes > 1) { - /* skip past (likely) used slots */ - search_start = SLOT_BYTE(smap->used_slots); + if ((smap->last_used_slot != -1) && (slot_bytes 
> 1)) {
+        /* skip past likely-used slots */
+        search_start = SLOT_BYTE(smap->last_used_slot);
     }
     uint8_t* usemap = get_use_map(smap);
     size_t map_bytes = slot_map_bytes(smap->total_slots);
@@ -292,8 +294,17 @@ ssize_t slotmap_reserve(slot_map* smap,
 
     if (found_start) {
         /* success, reserve bits in consecutive slots */
-        for (size_t i = 0; i < num_slots; i++) {
-            use_slot(usemap, start_slot + i);
+        size_t end_slot = start_slot + num_slots - 1;
+        for (size_t i = start_slot; i <= end_slot; i++) {
+            use_slot(usemap, i);
+        }
+        if ((smap->first_used_slot == -1) ||
+            (start_slot < smap->first_used_slot)) {
+            smap->first_used_slot = start_slot;
+        }
+        if ((smap->last_used_slot == -1) ||
+            (end_slot > smap->last_used_slot)) {
+            smap->last_used_slot = end_slot;
         }
         smap->used_slots += num_slots;
         return (ssize_t)start_slot;
@@ -328,11 +339,41 @@ int slotmap_release(slot_map* smap,
     }
 
     /* release the slots */
-    for (size_t i = 0; i < num_slots; i++) {
-        release_slot(usemap, start_index + i);
+    size_t end_slot = start_index + num_slots - 1;
+    for (size_t i = start_index; i <= end_slot; i++) {
+        release_slot(usemap, i);
     }
     smap->used_slots -= num_slots;
 
+    if (smap->used_slots == 0) {
+        smap->first_used_slot = -1;
+        smap->last_used_slot = -1;
+        return UNIFYFS_SUCCESS;
+    }
+
+    /* find new first-used slot if necessary */
+    if (start_index == smap->first_used_slot) {
+        ssize_t first_slot = end_slot + 1;
+        while ((first_slot < smap->total_slots) &&
+               (!check_slot(usemap, (size_t)first_slot))) {
+            first_slot++;
+        }
+        if (first_slot == smap->total_slots) {
+            first_slot = -1;
+        }
+        smap->first_used_slot = first_slot;
+    }
+
+    /* find new last-used slot if necessary */
+    if (end_slot == smap->last_used_slot) {
+        ssize_t last_slot = start_index - 1;
+        while ((last_slot >= 0) &&
+               (!check_slot(usemap, (size_t)last_slot))) {
+            last_slot--;
+        }
+        smap->last_used_slot = last_slot;
+    }
+
     return UNIFYFS_SUCCESS;
 }
 
diff --git a/common/src/slotmap.h b/common/src/slotmap.h
index ea937ff91..b23c74498 100644
--- a/common/src/slotmap.h
+++ b/common/src/slotmap.h
@@ -25,6 +25,8 @@ extern "C" {
 typedef struct slot_map {
     size_t total_slots;
     size_t used_slots;
+    ssize_t first_used_slot;
+    ssize_t last_used_slot;
 } slot_map;
 
 /* The slot usage bitmap immediately follows the structure in memory.
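
/* Note (illustrative, not part of the patch): the new first_used_slot and
 * last_used_slot fields let slotmap_reserve() resume its search after the
 * last slot known to be in use, which stays meaningful once releases punch
 * holes in the map; the old heuristic based on used_slots does not. A
 * hypothetical trace on a freshly cleared map:
 *
 *   slotmap_reserve(smap, 3);     // returns 0; first_used_slot=0, last_used_slot=2
 *   slotmap_release(smap, 0, 1);  // first_used_slot advances to 1, last_used_slot stays 2
 *   slotmap_release(smap, 1, 2);  // map empty again; both fields reset to -1
 */
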
diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index 29f212f33..1ce9663d5 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -39,7 +39,6 @@ typedef struct log_header { size_t data_sz; /* total data bytes in log */ size_t reserved_sz; /* reserved data bytes */ size_t chunk_sz; /* data chunk size */ - size_t max_reserved_slot; /* slot index for last reserved chunk */ off_t data_offset; /* file/memory offset where data chunks start */ } log_header; /* chunk slot_map immediately follows header and occupies rest of the page */ @@ -136,7 +135,10 @@ static int get_spillfile(const char* path, } /* map log header (1st page) of spill file given by file descriptor */ -static void* map_spillfile(int spill_fd, int mmap_prot, int n_pages) +static void* map_spillfile(int spill_fd, + int mmap_prot, + int n_pages, + int server) { int err; size_t pgsz = get_page_size(); @@ -152,7 +154,7 @@ static void* map_spillfile(int spill_fd, int mmap_prot, int n_pages) return NULL; } - if (mmap_prot == PROT_READ) { /* server maps for read only */ + if (server) { log_header* loghdr = (log_header*) addr; size_t hdr_sz = loghdr->hdr_sz; if (hdr_sz > mapsz) { @@ -184,6 +186,7 @@ int unifyfs_logio_init_server(const int app_id, } *pctx = NULL; + log_header* hdr = NULL; shm_context* shm_ctx = NULL; if (mem_size) { /* attach to client shmem region */ @@ -195,6 +198,9 @@ int unifyfs_logio_init_server(const int app_id, LOGERR("Failed to attach logio shmem buffer!"); return UNIFYFS_ERROR_SHMEM; } + hdr = (log_header*) shm_ctx->addr; + LOGDBG("shmem header - hdr_sz=%zu, data_sz=%zu, data_offset=%zu", + hdr->hdr_sz, hdr->data_sz, hdr->data_offset); } char spillfile[UNIFYFS_MAX_FILENAME]; @@ -215,12 +221,16 @@ int unifyfs_logio_init_server(const int app_id, return UNIFYFS_FAILURE; } else { /* map the start of the spill-over file, which contains log header - * and chunk slot_map. server only needs read access. */ - spill_mapping = map_spillfile(spill_fd, PROT_READ, 1); + * and chunk slot_map. server needs read and write access */ + int map_flags = PROT_READ | PROT_WRITE; + spill_mapping = map_spillfile(spill_fd, map_flags, 1, 1); if (NULL == spill_mapping) { LOGERR("Failed to map logio spill file header!"); return UNIFYFS_FAILURE; } + hdr = (log_header*) spill_mapping; + LOGDBG("spill header - hdr_sz=%zu, data_sz=%zu, data_offset=%zu", + hdr->hdr_sz, hdr->data_sz, hdr->data_offset); } } @@ -278,8 +288,8 @@ static int init_log_header(char* log_region, } /* chunk data starts after header pages */ - data_size = region_size - hdr_size; - size_t n_chunks = data_size / chunk_size; + size_t data_space = region_size - hdr_size; + size_t n_chunks = data_space / chunk_size; /* try to init chunk slotmap */ size_t slotmap_size = hdr_size - sizeof(log_header); @@ -290,6 +300,10 @@ static int init_log_header(char* log_region, hdr_pages++; continue; } + + /* the data_size is an exact multiple of chunk_size, which may be + * slightly less than the data_space */ + data_size = n_chunks * chunk_size; break; } @@ -355,6 +369,9 @@ int unifyfs_logio_init_client(const int app_id, LOGERR("Failed to initialize shmem logio header"); return rc; } + log_header* hdr = (log_header*) memlog; + LOGDBG("shmem header - hdr_sz=%zu, data_sz=%zu, data_offset=%zu", + hdr->hdr_sz, hdr->data_sz, hdr->data_offset); } /* will we use spillover to store the files? */ @@ -405,7 +422,7 @@ int unifyfs_logio_init_client(const int app_id, /* map start of the spill-over file, which contains log header * and chunk slot_map. 
client needs read and write access. */ int map_flags = PROT_READ | PROT_WRITE; - spill_mapping = map_spillfile(spill_fd, map_flags, n_pages); + spill_mapping = map_spillfile(spill_fd, map_flags, n_pages, 0); if (NULL == spill_mapping) { LOGERR("Failed to map logio spill file header!"); return UNIFYFS_FAILURE; @@ -418,6 +435,9 @@ int unifyfs_logio_init_client(const int app_id, LOGERR("Failed to initialize spill logio header"); return rc; } + log_header* hdr = (log_header*) spill; + LOGDBG("spill header - hdr_sz=%zu, data_sz=%zu, data_offset=%zu", + hdr->hdr_sz, hdr->data_sz, hdr->data_offset); } } @@ -541,7 +561,6 @@ int unifyfs_logio_alloc(logio_context* ctx, /* success, all needed chunks allocated in shmem */ allocated_bytes = res_chunks * chunk_sz; shmem_hdr->reserved_sz += allocated_bytes; - shmem_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; res_off = (off_t)(res_slot * chunk_sz); *log_offset = res_off; return UNIFYFS_SUCCESS; @@ -550,7 +569,7 @@ int unifyfs_logio_alloc(logio_context* ctx, /* could not get full allocation in shmem, reserve any available * chunks at the end of the shmem log */ size_t log_end_chunks = chunkmap->total_slots - - (shmem_hdr->max_reserved_slot + 1); + (chunkmap->last_used_slot + 1); if (log_end_chunks > 0) { res_chunks = log_end_chunks; res_slot = slotmap_reserve(chunkmap, res_chunks); @@ -584,7 +603,6 @@ int unifyfs_logio_alloc(logio_context* ctx, if (0 == mem_res_at_end) { /* success, full reservation in spill */ spill_hdr->reserved_sz += allocated_bytes; - spill_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; res_off = (off_t)(res_slot * chunk_sz); if (NULL != shmem_hdr) { /* update log offset to account for shmem log size */ @@ -622,9 +640,8 @@ int unifyfs_logio_alloc(logio_context* ctx, res_slot = slotmap_reserve(chunkmap, res_chunks); if (-1 != res_slot) { /* success, full reservation in spill */ + allocated_bytes = res_chunks * chunk_sz; spill_hdr->reserved_sz += allocated_bytes; - spill_hdr->max_reserved_slot = - (res_slot + res_chunks) - 1; res_off = (off_t)(res_slot * chunk_sz); if (NULL != shmem_hdr) { /* update log offset to include shmem log size */ @@ -636,10 +653,7 @@ int unifyfs_logio_alloc(logio_context* ctx, } else { /* successful reservation spanning shmem and spill */ shmem_hdr->reserved_sz += mem_allocation; - shmem_hdr->max_reserved_slot = - (mem_res_slot + mem_res_nchk) - 1; spill_hdr->reserved_sz += allocated_bytes; - spill_hdr->max_reserved_slot = (res_slot + res_chunks) - 1; *log_offset = res_off; return UNIFYFS_SUCCESS; } @@ -656,6 +670,7 @@ int unifyfs_logio_alloc(logio_context* ctx, LOGERR("slotmap_release() for logio shmem failed"); } } + LOGDBG("returning ENOSPC"); return ENOSPC; } @@ -689,6 +704,8 @@ int unifyfs_logio_free(logio_context* ctx, off_t spill_offset = 0; get_log_sizes(log_offset, nbytes, mem_size, &sz_in_mem, &sz_in_spill, &spill_offset); + LOGDBG("log_off=%zu, nbytes=%zu : mem_sz=%zu spill_sz=%zu spill_off=%zu", + log_offset, nbytes, sz_in_mem, sz_in_spill, (size_t)spill_offset); int rc = UNIFYFS_SUCCESS; size_t chunk_sz, chunk_slot, num_chunks; @@ -828,6 +845,8 @@ int unifyfs_logio_write(logio_context* ctx, off_t spill_offset = 0; get_log_sizes(log_offset, nbytes, mem_size, &sz_in_mem, &sz_in_spill, &spill_offset); + LOGDBG("log_off=%zu, nbytes=%zu : mem_sz=%zu spill_sz=%zu spill_off=%zu", + log_offset, nbytes, sz_in_mem, sz_in_spill, (size_t)spill_offset); /* do writes */ int err_rc = 0; diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index cd3141e9a..e45f1cd16 100644 --- 
a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -50,31 +50,7 @@ struct unifyfs_inode* unifyfs_inode_alloc(int gfid, unifyfs_file_attr_t* attr) return ino; } -static inline -int unifyfs_inode_destroy(struct unifyfs_inode* ino) -{ - int ret = UNIFYFS_SUCCESS; - - if (ino) { - if (NULL != ino->attr.filename) { - free(ino->attr.filename); - } - - if (NULL != ino->extents) { - extent_tree_destroy(ino->extents); - free(ino->extents); - } - - pthread_rwlock_destroy(&(ino->rwlock)); - ABT_mutex_free(&(ino->abt_sync)); - - free(ino); - } else { - ret = EINVAL; - } - - return ret; -} +static int unifyfs_inode_destroy(struct unifyfs_inode* ino); /** * @brief read lock the inode for ro access. @@ -138,6 +114,62 @@ int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr) return ret; } +static +int unifyfs_inode_destroy(struct unifyfs_inode* ino) +{ + int ret = UNIFYFS_SUCCESS; + + if (ino) { + if (NULL != ino->attr.filename) { + free(ino->attr.filename); + } + + if (NULL != ino->extents) { + /* iterate over extents and release local logio allocations */ + unifyfs_inode_rdlock(ino); + { + struct extent_tree* tree = ino->extents; + struct extent_tree_node* curr = NULL; + while (NULL != (curr = extent_tree_iter(tree, curr))) { + if (curr->svr_rank == glb_pmi_rank) { + /* lookup client's logio context and release + * allocation for this extent */ + int app_id = curr->app_id; + int client_id = curr->cli_id; + app_client* client = get_app_client(app_id, client_id); + if ((NULL == client) || + (NULL == client->state.logio_ctx)) { + continue; + } + logio_context* logio = client->state.logio_ctx; + size_t nbytes = (1 + (curr->end - curr->start)); + off_t log_off = curr->pos; + int rc = unifyfs_logio_free(logio, log_off, nbytes); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("failed to free logio allocation for " + "client[%d:%d] log_offset=%zu nbytes=%zu", + app_id, client_id, (size_t)log_off, nbytes); + } + } + } + } + unifyfs_inode_unlock(ino); + + extent_tree_destroy(ino->extents); + free(ino->extents); + } + + pthread_rwlock_destroy(&(ino->rwlock)); + ABT_mutex_free(&(ino->abt_sync)); + + free(ino); + } else { + ret = EINVAL; + } + + return ret; +} + int unifyfs_inode_update_attr(int gfid, int attr_op, unifyfs_file_attr_t* attr) { diff --git a/t/Makefile.am b/t/Makefile.am index 57dd2ac9c..049fbc556 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -110,14 +110,16 @@ api_api_test_t_CPPFLAGS = $(test_cppflags) api_api_test_t_LDADD = $(test_api_ldadd) api_api_test_t_LDFLAGS = $(test_api_ldflags) api_api_test_t_SOURCES = \ - api/api_suite.h \ - api/api_suite.c \ - api/init-fini.c \ - api/create-open-remove.c \ - api/write-read-sync-stat.c \ - api/laminate.c \ + api/api_suite.h \ + api/api_suite.c \ + api/init-fini.c \ + api/create-open-remove.c \ + api/write-read-sync-stat.c \ + api/laminate.c \ + api/storage-reuse.c \ api/transfer.c + test_sysio_sources = \ sys/sysio_suite.h \ sys/sysio_suite.c \ @@ -143,6 +145,7 @@ sys_sysio_static_t_LDADD = $(test_wrap_ldadd) sys_sysio_static_t_LDFLAGS = $(test_wrap_ldflags) sys_sysio_static_t_SOURCES = $(test_sysio_sources) + test_statfs_sources = \ sys/statfs_suite.h \ sys/statfs_suite.c \ diff --git a/t/api/api_suite.c b/t/api/api_suite.c index 70a170ff3..9284613c8 100644 --- a/t/api/api_suite.c +++ b/t/api/api_suite.c @@ -12,8 +12,8 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -//#include #include "api_suite.h" +#include /* This is the collection of library API tests. 
* @@ -49,6 +49,12 @@ int main(int argc, char* argv[]) * functionality or files that were already tested. */ + size_t spill_sz = (size_t)512 * MIB; + char* spill_size_env = getenv("UNIFYFS_LOGIO_SPILL_SIZE"); + if (NULL != spill_size_env) { + spill_sz = (size_t) strtoul(spill_size_env, NULL, 0); + } + rc = api_initialize_test(unifyfs_root, &fshdl); if (rc == UNIFYFS_SUCCESS) { api_create_open_remove_test(unifyfs_root, &fshdl); @@ -62,6 +68,9 @@ int main(int argc, char* argv[]) api_laminate_test(unifyfs_root, &fshdl); + api_storage_test(unifyfs_root, &fshdl, + spill_sz, (spill_sz / 8)); + api_transfer_test(unifyfs_root, tmp_dir, &fshdl, (size_t)64 * MIB, (size_t)4 * MIB); diff --git a/t/api/api_suite.h b/t/api/api_suite.h index 731c24d9f..48d59abca 100644 --- a/t/api/api_suite.h +++ b/t/api/api_suite.h @@ -53,6 +53,14 @@ int api_write_read_sync_stat_test(char* unifyfs_root, int api_laminate_test(char* unifyfs_root, unifyfs_handle* fshdl); + +/* Tests file storage space reuse */ +int api_storage_test(char* unifyfs_root, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize); + + /* Tests file transfers, both serial and parallel */ int api_transfer_test(char* unifyfs_root, char* tmpdir, diff --git a/t/api/storage-reuse.c b/t/api/storage-reuse.c new file mode 100644 index 000000000..c71357bf8 --- /dev/null +++ b/t/api/storage-reuse.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include +#include +#include +#include +#include + +#include "api_suite.h" + +int api_storage_test(char* unifyfs_root, + unifyfs_handle* fshdl, + size_t filesize, + size_t chksize) +{ + /* Create a random file names at the mountpoint path to test */ + char testfile1[64]; + char testfile2[64]; + testutil_rand_path(testfile1, sizeof(testfile1), unifyfs_root); + testutil_rand_path(testfile2, sizeof(testfile2), unifyfs_root); + + int rc; + + size_t n_chks = filesize / chksize; + size_t extra = filesize % chksize; + if (extra) { + /* test only supports exact multiples of chunk size */ + filesize -= extra; + } + + char* databuf = malloc(chksize); + ok(databuf != NULL, + "%s:%d malloc() of buffer with size=%zu is successful", + __FILE__, __LINE__, chksize); + if (NULL == databuf) { + diag("Initial setup failed"); + return 1; + } + + testutil_lipsum_generate(databuf, chksize, 0); + + diag("Starting API storage tests"); + + /** + * Overview of test workflow: + * (1) create new file (testfile1) + * (2) write and sync testfile1 to use most/all of spillover storage + * (3) stat testfile1 to verify size + * (4) create new file (testfile2) + * (5) 1st attempt to write to testfile2, which should fail due to ENOSPC + * (6) remove testfile1 to free some storage space + * (7) 2nd attempt to write to testfile2, which should succeed now + * that testfile1 storage allocations have been released + * (8) stat testfile2 to verify size + * (9) remove testfile2 + */ + + /* (1) create new file (testfile1) */ + + int t1_flags = 0; + unifyfs_gfid t1_gfid = UNIFYFS_INVALID_GFID; + rc = unifyfs_create(*fshdl, t1_flags, testfile1, &t1_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t1_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: gfid=%u rc=%d 
(%s)", + __FILE__, __LINE__, testfile1, (unsigned int)t1_gfid, + rc, unifyfs_rc_enum_description(rc)); + + /* (2) write and sync testfile1 to use most/all of spillover storage */ + + unifyfs_io_request t1_writes[n_chks + 1]; + memset(t1_writes, 0, sizeof(t1_writes)); + for (size_t i = 0; i < n_chks; i++) { + t1_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t1_writes[i].gfid = t1_gfid; + t1_writes[i].nbytes = chksize; + t1_writes[i].offset = (off_t)(i * chksize); + t1_writes[i].user_buf = databuf; + } + t1_writes[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + t1_writes[n_chks].gfid = t1_gfid; + + rc = unifyfs_dispatch_io(*fshdl, n_chks + 1, t1_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks + 1, t1_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + /* (3) stat testfile1 to verify size */ + + unifyfs_status t1_status = {0}; + rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); + /* expected size=filesize since writes have been synced */ + ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), + "%s:%d unifyfs_stat(gfid=%u) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, (unsigned int)t1_gfid, + t1_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + /* (4) create new file (testfile2) */ + + int t2_flags = 0; + unifyfs_gfid t2_gfid = UNIFYFS_INVALID_GFID; + rc = unifyfs_create(*fshdl, t2_flags, testfile2, &t2_gfid); + ok((rc == UNIFYFS_SUCCESS) && (t2_gfid != UNIFYFS_INVALID_GFID), + "%s:%d unifyfs_create(%s) is successful: gfid=%u rc=%d (%s)", + __FILE__, __LINE__, testfile2, (unsigned int)t2_gfid, + rc, unifyfs_rc_enum_description(rc)); + + /* (5) 1st attempt to write to testfile2, which should fail due to ENOSPC */ + + unifyfs_io_request t2_writes[n_chks]; + memset(t2_writes, 0, sizeof(t2_writes)); + for (size_t i = 0; i < n_chks; i++) { + t2_writes[i].op = UNIFYFS_IOREQ_OP_WRITE; + t2_writes[i].gfid = t2_gfid; + t2_writes[i].nbytes = chksize; + t2_writes[i].offset = (off_t)(i * chksize); + t2_writes[i].user_buf = databuf; + } + + rc = unifyfs_dispatch_io(*fshdl, n_chks, t2_writes); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks, t2_writes, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + /* check that at least one of the writes hit ENOSPC */ + int no_space_seen = 0; + for (size_t i = 0; i < n_chks; i++) { + if (ENOSPC == t2_writes[i].result.error) { + no_space_seen = 1; + } + } + ok(no_space_seen == 1, + "%s:%d 1st attempt to write to %s hit ENOSPC", + __FILE__, __LINE__, testfile2); + + /* (6) remove testfile1 to free some storage space */ + + rc = unifyfs_remove(*fshdl, testfile1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + /* (7) 2nd attempt to write to testfile2, which should succeed */ + + unifyfs_io_request t2_writes_2[n_chks + 1]; + memset(t2_writes_2, 0, sizeof(t2_writes_2)); + for (size_t i = 0; i < n_chks; i++) { + t2_writes_2[i].op = 
UNIFYFS_IOREQ_OP_WRITE; + t2_writes_2[i].gfid = t2_gfid; + t2_writes_2[i].nbytes = chksize; + t2_writes_2[i].offset = (off_t)(i * chksize); + t2_writes_2[i].user_buf = databuf; + } + t2_writes_2[n_chks].op = UNIFYFS_IOREQ_OP_SYNC_META; + t2_writes_2[n_chks].gfid = t2_gfid; + + rc = unifyfs_dispatch_io(*fshdl, n_chks + 1, t2_writes_2); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_dispatch_io(%s, OP_WRITE) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + rc = unifyfs_wait_io(*fshdl, n_chks + 1, t2_writes_2, 1); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_wait_io(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile1, rc, unifyfs_rc_enum_description(rc)); + + /* check that the writes were successful */ + int err_count = 0; + for (size_t i = 0; i < n_chks; i++) { + size_t bytes = t2_writes_2[i].nbytes; + off_t off = t2_writes_2[i].offset; + + /* check write operation status */ + int err = t2_writes_2[i].result.error; + size_t cnt = t2_writes_2[i].result.count; + ok((err == 0) && (cnt == bytes), + "%s:%d write(%s, offset=%zu, sz=%zu) is successful: count=%zu," + " rc=%d (%s)", __FILE__, __LINE__, testfile2, (size_t)off, + bytes, cnt, err, unifyfs_rc_enum_description(err)); + if (0 != err) { + err_count++; + } + } + ok(err_count == 0, + "%s:%d 2nd attempt to write to %s was successful", + __FILE__, __LINE__, testfile2); + + /* (8) stat testfile2 to verify size */ + + unifyfs_status t2_status = {0}; + rc = unifyfs_stat(*fshdl, t2_gfid, &t2_status); + /* expected size=filesize since writes have been synced */ + ok((rc == UNIFYFS_SUCCESS) && (t2_status.global_file_size == filesize), + "%s:%d unifyfs_stat(gfid=%u) is successful: filesize=%zu (expected=%zu)," + " rc=%d (%s)", __FILE__, __LINE__, (unsigned int)t2_gfid, + t2_status.global_file_size, filesize, + rc, unifyfs_rc_enum_description(rc)); + + /* (9) remove testfile2 */ + + rc = unifyfs_remove(*fshdl, testfile2); + ok(rc == UNIFYFS_SUCCESS, + "%s:%d unifyfs_remove(%s) is successful: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + + diag("Finished API storage tests"); + + return 0; +} From a9754d75bb52a810518145ac50d926d5eee25996 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Thu, 22 Jul 2021 15:46:44 -0400 Subject: [PATCH 33/81] add heartbeat rpc to detect failed clients --- client/src/margo_client.c | 81 +++++++++++++++++++++++----- client/src/margo_client.h | 4 ++ common/src/arraylist.c | 11 ++-- common/src/unifyfs_client_rpcs.h | 9 ++++ server/src/margo_server.c | 55 +++++++++++++++++++ server/src/margo_server.h | 4 ++ server/src/unifyfs_global.h | 6 +++ server/src/unifyfs_group_rpc.c | 2 +- server/src/unifyfs_request_manager.c | 38 +++++++++++++ server/src/unifyfs_server.c | 64 ++++++++++++++++++++++ 10 files changed, 256 insertions(+), 18 deletions(-) diff --git a/client/src/margo_client.c b/client/src/margo_client.c index ef7ff89ff..3878c9cdc 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -33,6 +33,8 @@ static void register_client_rpcs(client_rpc_context_t* ctx) hg_id_t hgid; + /* client-to-server RPCs */ + #define CLIENT_REGISTER_RPC(name) \ do { \ hgid = MARGO_REGISTER(mid, "unifyfs_" #name "_rpc", \ @@ -42,15 +44,6 @@ static void register_client_rpcs(client_rpc_context_t* ctx) ctx->rpcs.name##_id = hgid; \ } while (0) -#define CLIENT_REGISTER_RPC_HANDLER(name) \ - do { \ - hgid = MARGO_REGISTER(mid, "unifyfs_" #name "_rpc", \ - unifyfs_##name##_in_t, \ - unifyfs_##name##_out_t, \ - unifyfs_##name##_rpc); \ - ctx->rpcs.name##_id = hgid; \ - } while (0) - CLIENT_REGISTER_RPC(attach); CLIENT_REGISTER_RPC(mount); CLIENT_REGISTER_RPC(unmount); @@ -63,11 +56,25 @@ static void register_client_rpcs(client_rpc_context_t* ctx) CLIENT_REGISTER_RPC(laminate); CLIENT_REGISTER_RPC(fsync); CLIENT_REGISTER_RPC(mread); + +#undef CLIENT_REGISTER_RPC + + /* server-to-client RPCs */ + +#define CLIENT_REGISTER_RPC_HANDLER(name) \ + do { \ + hgid = MARGO_REGISTER(mid, "unifyfs_" #name "_rpc", \ + unifyfs_##name##_in_t, \ + unifyfs_##name##_out_t, \ + unifyfs_##name##_rpc); \ + ctx->rpcs.name##_id = hgid; \ + } while (0) + + CLIENT_REGISTER_RPC_HANDLER(heartbeat); CLIENT_REGISTER_RPC_HANDLER(mread_req_data); CLIENT_REGISTER_RPC_HANDLER(mread_req_complete); CLIENT_REGISTER_RPC_HANDLER(transfer_complete); -#undef CLIENT_REGISTER_RPC #undef CLIENT_REGISTER_RPC_HANDLER } @@ -191,6 +198,8 @@ int unifyfs_client_rpc_finalize(void) return UNIFYFS_SUCCESS; } +/*--- Invocation methods for client-to-server RPCs ---*/ + /* create and return a margo handle for given rpc id */ static hg_handle_t create_handle(hg_id_t id) { @@ -847,6 +856,53 @@ int invoke_client_mread_rpc(unifyfs_client* client, return ret; } +/*--- Handler methods for server-to-client RPCs ---*/ + +/* simple heartbeat ping rpc */ +static void unifyfs_heartbeat_rpc(hg_handle_t handle) +{ + int ret; + + /* get input params */ + unifyfs_heartbeat_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* lookup client */ + unifyfs_client* client; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; + client = unifyfs_find_client(client_app, client_id, NULL); + if (NULL == client) { + /* unknown client */ + ret = EINVAL; + } else if (client->state.is_mounted) { + /* client is still active */ + ret = UNIFYFS_SUCCESS; + } else { + ret = UNIFYFS_FAILURE; + } + margo_free_input(handle, &in); + } + + /* set rpc result status */ + unifyfs_heartbeat_out_t out; + out.ret = ret; + + /* return to caller */ + LOGDBG("responding"); + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + 
margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_heartbeat_rpc) + /* for client read request identified by mread_id and request index, copy bulk * data to request's user buffer at given byte offset from start of request */ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) @@ -954,9 +1010,8 @@ static void unifyfs_mread_req_data_rpc(hg_handle_t handle) unifyfs_mread_req_data_out_t out; out.ret = ret; - LOGDBG("responding"); - /* return to caller */ + LOGDBG("responding"); hret = margo_respond(handle, &out); if (hret != HG_SUCCESS) { LOGERR("margo_respond() failed"); @@ -1034,7 +1089,7 @@ static void unifyfs_transfer_complete_rpc(hg_handle_t handle) LOGERR("margo_get_input() failed"); ret = UNIFYFS_ERROR_MARGO; } else { - /* lookup client mread request */ + /* lookup client transfer request */ unifyfs_client* client; int client_app = (int) in.app_id; int client_id = (int) in.client_id; diff --git a/client/src/margo_client.h b/client/src/margo_client.h index cdc296a11..0cfa27b3a 100644 --- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -24,6 +24,7 @@ #include typedef struct ClientRpcIds { + /* client-to-server */ hg_id_t attach_id; hg_id_t mount_id; hg_id_t unmount_id; @@ -36,6 +37,9 @@ typedef struct ClientRpcIds { hg_id_t laminate_id; hg_id_t fsync_id; hg_id_t mread_id; + + /* server-to-client */ + hg_id_t heartbeat_id; hg_id_t mread_req_data_id; hg_id_t mread_req_complete_id; hg_id_t transfer_complete_id; diff --git a/common/src/arraylist.c b/common/src/arraylist.c index 0892adc15..f9f6f5383 100644 --- a/common/src/arraylist.c +++ b/common/src/arraylist.c @@ -170,12 +170,15 @@ int arraylist_free(arraylist_t* arr) return -1; } - int i; - for (i = 0; i < arr->cap; i++) { - if (arr->elems[i] != NULL) { - free(arr->elems[i]); + if (NULL != arr->elems) { + for (int i = 0; i < arr->cap; i++) { + if (arr->elems[i] != NULL) { + free(arr->elems[i]); + } } + free(arr->elems); } + free(arr); return 0; diff --git a/common/src/unifyfs_client_rpcs.h b/common/src/unifyfs_client_rpcs.h index 3aaa398e8..d13729631 100644 --- a/common/src/unifyfs_client_rpcs.h +++ b/common/src/unifyfs_client_rpcs.h @@ -251,6 +251,15 @@ MERCURY_GEN_PROC(unifyfs_mread_req_complete_in_t, MERCURY_GEN_PROC(unifyfs_mread_req_complete_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_mread_req_complete_rpc) +/* unifyfs_heartbeat_rpc (server => client) + * + * Used to detect when client unexpectedly goes away */ +MERCURY_GEN_PROC(unifyfs_heartbeat_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id))) +MERCURY_GEN_PROC(unifyfs_heartbeat_out_t, ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_heartbeat_rpc) + #ifdef __cplusplus } // extern "C" #endif diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 1d7142233..25a037c84 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -316,6 +316,12 @@ static void register_client_server_rpcs(margo_instance_id mid) unifyfs_mread_rpc); /* register the RPCs we call (and capture assigned hg_id_t) */ + unifyfsd_rpc_context->rpcs.client_heartbeat_id = + MARGO_REGISTER(mid, "unifyfs_heartbeat_rpc", + unifyfs_heartbeat_in_t, + unifyfs_heartbeat_out_t, + NULL); + unifyfsd_rpc_context->rpcs.client_mread_data_id = MARGO_REGISTER(mid, "unifyfs_mread_req_data_rpc", unifyfs_mread_req_data_in_t, @@ -618,6 +624,55 @@ static hg_handle_t create_client_handle(hg_id_t id, return handle; } +/* invokes the heartbeat rpc function */ +int invoke_client_heartbeat_rpc(int app_id, + int client_id) +{ + hg_return_t hret; + + /* check 
that we have initialized margo */ + if (NULL == unifyfsd_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* fill input struct */ + unifyfs_heartbeat_in_t in; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; + + /* get handle to rpc function */ + hg_id_t rpc_id = unifyfsd_rpc_context->rpcs.client_heartbeat_id; + hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); + + /* call rpc function */ + LOGDBG("invoking the heartbeat rpc function in client"); + double timeout_msec = 500; /* half a second */ + hret = margo_forward_timed(handle, &in, timeout_msec); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward_timed() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_heartbeat_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + + return ret; +} + /* invokes the client mread request data response rpc function */ int invoke_client_mread_req_data_rpc(int app_id, int client_id, diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 5209d7353..ae9fc7112 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -51,6 +51,7 @@ typedef struct ServerRpcIds { hg_id_t unlink_bcast_id; /* client-server rpcs */ + hg_id_t client_heartbeat_id; hg_id_t client_mread_data_id; hg_id_t client_mread_complete_id; hg_id_t client_transfer_complete_id; @@ -82,6 +83,9 @@ void* pull_margo_bulk_buffer(hg_handle_t rpc_hdl, hg_size_t bulk_sz, hg_bulk_t* local_bulk); +/* invokes the client heartbeat rpc function */ +int invoke_client_heartbeat_rpc(int app_id, int client_id); + /* invokes the client mread request data response rpc function */ int invoke_client_mread_req_data_rpc(int app_id, int client_id, diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index 60efc9525..068c1d3b5 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -131,6 +131,7 @@ typedef struct { struct reqmgr_thrd; + /** * Structure to maintain application client state, including * logio and shared memory contexts, margo rpc address, etc. 
@@ -187,6 +188,11 @@ unifyfs_rc disconnect_app_client(app_client* clnt); unifyfs_rc cleanup_app_client(app_config* app, app_client* clnt); +/* arraylist to track failed clients */ +arraylist_t* failed_clients; // = NULL +unifyfs_rc add_failed_client(int app_id, int client_id); + + /* publish the pids of all servers to a shared file */ int unifyfs_publish_server_pids(void); diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index 6dc5340d6..d2f081422 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -379,7 +379,7 @@ static int collective_finish(coll_request* coll_req) if (NULL != coll_req->child_reqs) { margo_request* creq; hg_handle_t* chdl; - /* MJB TODO - use margo_wait_any() instead of our own loop */ + /* TODO: use margo_wait_any() instead of our own loop */ for (i = 0; i < child_count; i++) { chdl = coll_req->child_hdls + i; creq = coll_req->child_reqs + i; diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index a0b363039..4fc161505 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -1312,6 +1312,38 @@ static int rm_process_client_requests(reqmgr_thrd_t* reqmgr) return ret; } +static int rm_heartbeat(reqmgr_thrd_t* reqmgr) +{ + static time_t last_check; // = 0 + static int check_interval = 30; /* seconds */ + + int ret = UNIFYFS_SUCCESS; + + /* send a heartbeat rpc to associated client every 30 seconds */ + time_t now = time(NULL); + if (0 == last_check) { + last_check = now; + } + + time_t elapsed = now - last_check; + if (elapsed >= check_interval) { + last_check = now; + + /* invoke heartbeat rpc */ + LOGDBG("sending heartbeat rpc"); + int app = reqmgr->app_id; + int clid = reqmgr->client_id; + int rc = invoke_client_heartbeat_rpc(app, clid); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + LOGDBG("heartbeat rpc for client[%d:%d] failed", app, clid); + add_failed_client(app, clid); + } + } + + return ret; +} + /* Entry point for request manager thread. One thread is created * for each client process to retrieve remote data and notify the * client when data is ready. 
@@ -1381,6 +1413,12 @@ void* request_manager_thread(void* arg) thrd_ctrl->waiting_for_work = 0; RM_UNLOCK(thrd_ctrl); + rc = rm_heartbeat(thrd_ctrl); + if (rc != UNIFYFS_SUCCESS) { + /* detected failure of our client, time to exit */ + break; + } + /* bail out if we've been told to exit */ if (thrd_ctrl->exit_flag == 1) { break; diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index daede2f54..f2192fa28 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -248,6 +248,39 @@ static int process_servers_hostfile(const char* hostfile) return (int)UNIFYFS_SUCCESS; } +static void process_client_failures(void) +{ + int num_failed = 0; + arraylist_t* failures = NULL; + + ABT_mutex_lock(app_configs_abt_sync); + if (NULL != failed_clients) { + /* if we have any failed clients, take pointer to the list + * and replace it with a newly allocated list */ + num_failed = arraylist_size(failed_clients); + if (num_failed) { + LOGDBG("processing %d client failures", num_failed); + failures = failed_clients; + failed_clients = arraylist_create(0); + } + } + ABT_mutex_unlock(app_configs_abt_sync); + + if (NULL != failures) { + app_config* app; + app_client* client; + for (int i = 0; i < num_failed; i++) { + /* cleanup client at index */ + client = (app_client*) arraylist_remove(failures, i); + if (NULL != client) { + app = get_application(client->state.app_id); + cleanup_app_client(app, client); + } + } + arraylist_free(failures); + } +} + int main(int argc, char* argv[]) { int rc; @@ -375,6 +408,10 @@ int main(int argc, char* argv[]) exit(1); } + ABT_mutex_lock(app_configs_abt_sync); + failed_clients = arraylist_create(0); + ABT_mutex_unlock(app_configs_abt_sync); + /* launch the service manager (note: must happen after ABT_init) */ LOGDBG("launching service manager thread"); rc = svcmgr_init(); @@ -411,7 +448,11 @@ int main(int argc, char* argv[]) LOGDBG("server[%d] - finished initialization", glb_pmi_rank); while (1) { + /* process any newly failed clients */ + process_client_failures(); + sleep(1); + if (time_to_exit) { LOGDBG("starting service shutdown"); break; @@ -605,6 +646,10 @@ static int unifyfs_exit(void) } } } + if (NULL != failed_clients) { + arraylist_free(failed_clients); + failed_clients = NULL; + } ABT_mutex_unlock(app_configs_abt_sync); /* TODO: notify the service threads to exit */ @@ -986,3 +1031,22 @@ unifyfs_rc cleanup_app_client(app_config* app, app_client* client) return UNIFYFS_SUCCESS; } + +unifyfs_rc add_failed_client(int app_id, int client_id) +{ + app_client* client = get_app_client(app_id, client_id); + if (NULL == client) { + return EINVAL; + } + unifyfs_rc ret = UNIFYFS_SUCCESS; + ABT_mutex_lock(app_configs_abt_sync); + if (NULL != failed_clients) { + int rc = arraylist_add(failed_clients, client); + if (rc == -1) { + LOGERR("failed to add client to failed_clients arraylist"); + ret = UNIFYFS_FAILURE; + } + } + ABT_mutex_unlock(app_configs_abt_sync); + return ret; +} From 3447fb4e718c16a2b621e56a7eb3ea9372913a47 Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Tue, 3 Aug 2021 11:30:46 -0700 Subject: [PATCH 34/81] use ABT after calling margo_init --- server/src/unifyfs_server.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index f2192fa28..9519c5a8f 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -396,8 +396,6 @@ int main(int argc, char* argv[]) } LOGDBG("initializing rpc service"); - ABT_init(argc, argv); - 
ABT_mutex_create(&app_configs_abt_sync); rc = configurator_bool_val(server_cfg.margo_lazy_connect, &margo_lazy_connect); rc = configurator_bool_val(server_cfg.margo_tcp, @@ -408,6 +406,11 @@ int main(int argc, char* argv[]) exit(1); } + /* We wait to call any ABT functions until after margo_init. + * Margo configures ABT in a particular way, so we defer to + * Margo to call ABT_init. */ + ABT_mutex_create(&app_configs_abt_sync); + ABT_mutex_lock(app_configs_abt_sync); failed_clients = arraylist_create(0); ABT_mutex_unlock(app_configs_abt_sync); From bcabab45c5fd9d5823fd38f75565e575cba8344c Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Mon, 3 May 2021 17:27:57 -0700 Subject: [PATCH 35/81] enable servers to use pmi to get rank and count --- common/src/unifyfs_keyval.c | 120 +++++++++++++++++------- common/src/unifyfs_keyval.h | 7 +- server/src/Makefile.am | 10 ++ server/src/margo_server.c | 165 ++++++++++++++++++--------------- server/src/unifyfs_global.h | 1 - server/src/unifyfs_server.c | 176 +++++++++++++++++++----------------- 6 files changed, 287 insertions(+), 192 deletions(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 4975a4794..151661d72 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -37,7 +37,6 @@ const char* const key_unifyfsd_socket = "unifyfsd.socket"; const char* const key_unifyfsd_margo_shm = "unifyfsd.margo-shm"; const char* const key_unifyfsd_margo_svr = "unifyfsd.margo-svr"; -const char* const key_unifyfsd_pmi_rank = "unifyfsd.pmi-rank"; // key-value store state static int kv_initialized; // = 0 @@ -58,6 +57,9 @@ static size_t kv_max_vallen; // = 0 # define UNIFYFS_MAX_KV_VALLEN 4096 #endif +/* PMI information */ +int glb_pmi_rank = -1; +int glb_pmi_size; /* = 0 */ //--------------------- PMI2 K-V Store --------------------- #if defined(USE_PMI2) @@ -127,12 +129,17 @@ static void unifyfs_pmi2_errstr(int rc) } // initialize PMI2 -static int unifyfs_pmi2_init(void) +int unifyfs_pmi2_init(void) { int nprocs, rank, rc, val, len, found; int pmi_world_rank = -1; int pmi_world_nprocs = -1; + /* return success if we're already initialized */ + if (pmi2_initialized) { + return (int)UNIFYFS_SUCCESS; + } + kv_max_keylen = PMI2_MAX_KEYLEN; kv_max_vallen = PMI2_MAX_VALLEN; @@ -191,6 +198,9 @@ static int unifyfs_pmi2_init(void) kv_myrank = pmi_world_rank; kv_nranks = pmi_world_nprocs; + glb_pmi_rank = kv_myrank; + glb_pmi_size = kv_nranks; + LOGDBG("PMI2 Job Id: %s, Rank: %d of %d, hasNameServer=%d", pmi_jobid, kv_myrank, kv_nranks, pmi2_has_nameserv); @@ -239,6 +249,26 @@ static int unifyfs_pmi2_lookup(const char* key, LOGERR("PMI2_KVS_Get(%s) failed: %s", key, pmi2_errstr); return (int)UNIFYFS_ERROR_PMI; } + + // HACK: replace '!' with ';' for SLURM PMI2 + // This assumes the value does not actually use "!" + // + // At least one version of SLURM PMI2 seems to use ";" + // characters to separate key/value pairs, so the following: + // + // PMI2_KVS_Put("unifyfs.margo-svr", "ofi+tcp;ofi_rxm://ip:port") + // + // leads to an error like: + // + // slurmstepd: error: mpi/pmi2: no value for key ;ofi_rxm://ip:port; in req + char* p = pmi2_val; + while (*p != '\0') { + if (*p == '!') { + *p = ';'; + } + p++; + } + *oval = strdup(pmi2_val); return (int)UNIFYFS_SUCCESS; } @@ -257,6 +287,26 @@ static int unifyfs_pmi2_publish(const char* key, strncpy(pmi2_key, key, sizeof(pmi2_key)); strncpy(pmi2_val, val, sizeof(pmi2_val)); + + // HACK: replace ';' with '!' for SLURM PMI2 + // This assumes the value does not actually use "!" 
+ // + // At least one version of SLURM PMI2 seems to use ";" + // characters to separate key/value pairs, so the following: + // + // PMI2_KVS_Put("unifyfs.margo-svr", "ofi+tcp;ofi_rxm://ip:port") + // + // leads to an error like: + // + // slurmstepd: error: mpi/pmi2: no value for key ;ofi_rxm://ip:port; in req + char* p = pmi2_val; + while (*p != '\0') { + if (*p == ';') { + *p = '!'; + } + p++; + } + rc = PMI2_KVS_Put(pmi2_key, pmi2_val); if (rc != PMI2_SUCCESS) { unifyfs_pmi2_errstr(rc); @@ -294,7 +344,7 @@ static pmix_proc_t pmix_myproc; #endif // initialize PMIx -static int unifyfs_pmix_init(void) +int unifyfs_pmix_init(void) { int rc; size_t pmix_univ_nprocs; @@ -302,6 +352,11 @@ static int unifyfs_pmix_init(void) pmix_value_t* valp = &value; pmix_proc_t proc; + /* return success if we're already initialized */ + if (pmix_initialized) { + return (int)UNIFYFS_SUCCESS; + } + /* init PMIx */ PMIX_PROC_CONSTRUCT(&pmix_myproc); rc = PMIx_Init(&pmix_myproc, NULL, 0); @@ -328,6 +383,9 @@ static int unifyfs_pmix_init(void) kv_myrank = pmix_myproc.rank; kv_nranks = (int)pmix_univ_nprocs; + glb_pmi_rank = kv_myrank; + glb_pmi_size = kv_nranks; + LOGDBG("PMIX Job Id: %s, Rank: %d of %d", pmix_myproc.nspace, kv_myrank, kv_nranks); @@ -490,7 +548,6 @@ static int unifyfs_fskv_init(unifyfs_cfg_t* cfg) int rc, err; struct stat s; - if (NULL == cfg) { LOGERR("NULL config"); return EINVAL; @@ -715,6 +772,27 @@ static int unifyfs_fskv_lookup_local(const char* key, return (int)UNIFYFS_SUCCESS; } +// publish a key-value pair +static int unifyfs_fskv_publish_local(const char* key, + const char* val) +{ + FILE* kvf; + char kvfile[UNIFYFS_MAX_FILENAME]; + + scnprintf(kvfile, sizeof(kvfile), "%s/%s", + localfs_kvdir, key); + kvf = fopen(kvfile, "w"); + if (NULL == kvf) { + LOGERR("failed to create kvstore entry %s", kvfile); + return (int)UNIFYFS_ERROR_KEYVAL; + } + fprintf(kvf, "%s\n", val); + fclose(kvf); + + return (int)UNIFYFS_SUCCESS; +} + +#if (!defined(USE_PMI2)) && (!defined(USE_PMIX)) static int unifyfs_fskv_lookup_remote(int rank, const char* key, char** oval) @@ -748,26 +826,6 @@ static int unifyfs_fskv_lookup_remote(int rank, return (int)UNIFYFS_SUCCESS; } -// publish a key-value pair -static int unifyfs_fskv_publish_local(const char* key, - const char* val) -{ - FILE* kvf; - char kvfile[UNIFYFS_MAX_FILENAME]; - - scnprintf(kvfile, sizeof(kvfile), "%s/%s", - localfs_kvdir, key); - kvf = fopen(kvfile, "w"); - if (NULL == kvf) { - LOGERR("failed to create kvstore entry %s", kvfile); - return (int)UNIFYFS_ERROR_KEYVAL; - } - fprintf(kvf, "%s\n", val); - fclose(kvf); - - return (int)UNIFYFS_SUCCESS; -} - static int unifyfs_fskv_publish_remote(const char* key, const char* val) { @@ -791,7 +849,6 @@ static int unifyfs_fskv_publish_remote(const char* key, return (int)UNIFYFS_SUCCESS; } -#if (!defined(USE_PMI2)) && (!defined(USE_PMIX)) static int unifyfs_fskv_fence(void) { if (!have_sharedfs_kvstore) { @@ -840,6 +897,7 @@ int unifyfs_keyval_init(unifyfs_cfg_t* cfg, kv_nranks = *nranks; } #endif + // NOTE: do this after getting rank/n_ranks info rc = unifyfs_fskv_init(cfg); if (rc != (int)UNIFYFS_SUCCESS) { @@ -855,6 +913,7 @@ int unifyfs_keyval_init(unifyfs_cfg_t* cfg, if (NULL != nranks) { *nranks = kv_nranks; } + return (int)UNIFYFS_SUCCESS; } @@ -959,11 +1018,8 @@ int unifyfs_keyval_lookup_remote(int rank, #elif defined(USE_PMI2) rc = unifyfs_pmi2_lookup(rank_key, oval); #else - rc = (int)UNIFYFS_FAILURE; + rc = unifyfs_fskv_lookup_remote(rank, key, oval); #endif - if (rc != (int)UNIFYFS_SUCCESS) { 
- rc = unifyfs_fskv_lookup_remote(rank, key, oval); - } if (rc != (int)UNIFYFS_SUCCESS) { LOGERR("remote keyval lookup for '%s' failed", key); } @@ -1043,12 +1099,8 @@ int unifyfs_keyval_publish_remote(const char* key, #elif defined(USE_PMI2) rc = unifyfs_pmi2_publish(rank_key, val); #else - rc = (int)UNIFYFS_FAILURE; + rc = unifyfs_fskv_publish_remote(key, val); #endif - if (rc != (int)UNIFYFS_SUCCESS) { - rc = unifyfs_fskv_publish_remote(key, val); - } - if (rc != (int)UNIFYFS_SUCCESS) { LOGERR("remote keyval publish for '%s' failed", key); } else { diff --git a/common/src/unifyfs_keyval.h b/common/src/unifyfs_keyval.h index d56e884cd..27d4b96e9 100644 --- a/common/src/unifyfs_keyval.h +++ b/common/src/unifyfs_keyval.h @@ -21,11 +21,16 @@ extern "C" { #endif +extern int glb_pmi_rank; +extern int glb_pmi_size; + +int unifyfs_pmix_init(void); +int unifyfs_pmi2_init(void); + // keys we use extern const char* const key_unifyfsd_socket; // server domain socket path extern const char* const key_unifyfsd_margo_shm; // client-server margo address extern const char* const key_unifyfsd_margo_svr; // server-server margo address -extern const char* const key_unifyfsd_pmi_rank; // server-server pmi rank // initialize key-value store int unifyfs_keyval_init(unifyfs_cfg_t* cfg, diff --git a/server/src/Makefile.am b/server/src/Makefile.am index ca5c62191..634503de2 100644 --- a/server/src/Makefile.am +++ b/server/src/Makefile.am @@ -72,6 +72,16 @@ else # ! USE_MDHIM endif # USE_MDHIM +if USE_PMIX + OPT_C_FLAGS += -DUSE_PMIX + OPT_LIBS += -lpmix +endif + +if USE_PMI2 + OPT_C_FLAGS += -DUSE_PMI2 + OPT_LIBS += -lpmi2 +endif + unifyfsd_CFLAGS = $(AM_CFLAGS) $(UNIFYFS_COMMON_FLAGS) $(OPT_C_FLAGS) unifyfsd_LDFLAGS = $(OPT_LD_FLAGS) unifyfsd_LDADD = $(UNIFYFS_COMMON_LIBS) $(OPT_LIBS) diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 25a037c84..0065fbd2b 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -32,6 +32,10 @@ int margo_client_server_pool_sz = 4; int margo_server_server_pool_sz = 4; int margo_use_progress_thread = 1; +// records pmi rank, server address string, and server address +// for each server for use in server-to-server rpcs +static server_info_t* server_infos; // array of server_info_t + #if defined(NA_HAS_SM) static const char* PROTOCOL_MARGO_SHM = "na+sm"; #else @@ -58,30 +62,51 @@ static const char* PROTOCOL_MARGO_OFI_TCP; static const char* PROTOCOL_MARGO_OFI_RMA; #endif -/* setup_remote_target - Initializes the server-server margo target */ -static margo_instance_id setup_remote_target(void) +/* Given a margo instance ID (mid), return its corresponding + * address as a newly allocated string to be freed by caller. + * Returns NULL on error. 
*/ +static char* get_margo_addr_str(margo_instance_id mid) { - /* initialize margo */ - hg_return_t hret; + /* get margo address for given instance */ hg_addr_t addr_self; + hg_return_t hret = margo_addr_self(mid, &addr_self); + if (hret != HG_SUCCESS) { + LOGERR("margo_addr_self() failed"); + return NULL; + } + + /* convert margo address to a string */ char self_string[128]; hg_size_t self_string_sz = sizeof(self_string); - margo_instance_id mid; - const char* margo_protocol; + hret = margo_addr_to_string(mid, + self_string, &self_string_sz, addr_self); + if (hret != HG_SUCCESS) { + LOGERR("margo_addr_to_string() failed"); + margo_addr_free(mid, addr_self); + return NULL; + } + margo_addr_free(mid, addr_self); + + /* return address in newly allocated string */ + char* addr = strdup(self_string); + return addr; +} +/* setup_remote_target - Initializes the server-server margo target */ +static margo_instance_id setup_remote_target(void) +{ /* by default we try to use ofi */ - margo_protocol = margo_use_tcp ? + const char* margo_protocol = margo_use_tcp ? PROTOCOL_MARGO_OFI_TCP : PROTOCOL_MARGO_OFI_RMA; - - /* when ofi is not available, fallback to using bmi */ if (!margo_protocol) { + /* when ofi is not available, fallback to using bmi */ LOGWARN("OFI is not available, using BMI for margo rpc"); margo_protocol = PROTOCOL_MARGO_BMI_TCP; } - mid = margo_init(margo_protocol, MARGO_SERVER_MODE, - margo_use_progress_thread, - margo_server_server_pool_sz); + /* initialize margo */ + margo_instance_id mid = margo_init(margo_protocol, MARGO_SERVER_MODE, + margo_use_progress_thread, margo_server_server_pool_sz); if (mid == MARGO_INSTANCE_NULL) { LOGERR("margo_init(%s, SERVER_MODE, %d, %d) failed", margo_protocol, margo_use_progress_thread, @@ -101,28 +126,20 @@ static margo_instance_id setup_remote_target(void) } } - /* figure out what address this server is listening on */ - hret = margo_addr_self(mid, &addr_self); - if (hret != HG_SUCCESS) { - LOGERR("margo_addr_self() failed"); - margo_finalize(mid); - return MARGO_INSTANCE_NULL; - } - hret = margo_addr_to_string(mid, - self_string, &self_string_sz, - addr_self); - if (hret != HG_SUCCESS) { - LOGERR("margo_addr_to_string() failed"); - margo_addr_free(mid, addr_self); + /* get our address for server-server rpcs */ + char* self_string = get_margo_addr_str(mid); + if (NULL == self_string) { + LOGERR("invalid value to publish server-server margo rpc address"); margo_finalize(mid); return MARGO_INSTANCE_NULL; } LOGINFO("margo RPC server: %s", self_string); - margo_addr_free(mid, addr_self); /* publish rpc address of server for remote servers */ rpc_publish_remote_server_addr(self_string); + free(self_string); + return mid; } @@ -225,12 +242,7 @@ static margo_instance_id setup_local_target(void) { /* initialize margo */ const char* margo_protocol = PROTOCOL_MARGO_SHM; - hg_return_t hret; - hg_addr_t addr_self; - char self_string[128]; - hg_size_t self_string_sz = sizeof(self_string); - margo_instance_id mid; - mid = margo_init(margo_protocol, MARGO_SERVER_MODE, + margo_instance_id mid = margo_init(margo_protocol, MARGO_SERVER_MODE, margo_use_progress_thread, margo_client_server_pool_sz); if (mid == MARGO_INSTANCE_NULL) { LOGERR("margo_init(%s, SERVER_MODE, %d, %d) failed", margo_protocol, @@ -239,27 +251,19 @@ static margo_instance_id setup_local_target(void) } /* figure out what address this server is listening on */ - hret = margo_addr_self(mid, &addr_self); - if (hret != HG_SUCCESS) { + char* self_string = get_margo_addr_str(mid); + if (NULL == 
self_string) { LOGERR("margo_addr_self() failed"); margo_finalize(mid); return MARGO_INSTANCE_NULL; } - hret = margo_addr_to_string(mid, - self_string, &self_string_sz, - addr_self); - if (hret != HG_SUCCESS) { - LOGERR("margo_addr_to_string() failed"); - margo_addr_free(mid, addr_self); - margo_finalize(mid); - return MARGO_INSTANCE_NULL; - } LOGINFO("shared-memory margo RPC server: %s", self_string); - margo_addr_free(mid, addr_self); /* publish rpc address of server for local clients */ rpc_publish_local_server_addr(self_string); + free(self_string); + return mid; } @@ -425,19 +429,21 @@ int margo_server_rpc_finalize(void) /* free global server addresses */ for (int i = 0; i < glb_num_servers; i++) { - if (glb_servers[i].margo_svr_addr != HG_ADDR_NULL) { - margo_addr_free(ctx->svr_mid, glb_servers[i].margo_svr_addr); - glb_servers[i].margo_svr_addr = HG_ADDR_NULL; + server_info_t* server = &server_infos[i]; + if (server->margo_svr_addr != HG_ADDR_NULL) { + margo_addr_free(ctx->svr_mid, server->margo_svr_addr); + server->margo_svr_addr = HG_ADDR_NULL; } - if (NULL != glb_servers[i].margo_svr_addr_str) { - free(glb_servers[i].margo_svr_addr_str); - glb_servers[i].margo_svr_addr_str = NULL; + if (NULL != server->margo_svr_addr_str) { + free(server->margo_svr_addr_str); + server->margo_svr_addr_str = NULL; } } /* shut down margo */ LOGDBG("finalizing server-server margo"); margo_finalize(ctx->svr_mid); + /* NOTE: 2nd call to margo_finalize() sometimes crashes - Margo bug? */ LOGDBG("finalizing client-server margo"); margo_finalize(ctx->shm_mid); @@ -454,18 +460,21 @@ int margo_connect_server(int rank) assert(rank < glb_num_servers); int ret = UNIFYFS_SUCCESS; + + server_info_t* server = &server_infos[rank]; + + /* lookup rpc address for this server */ char* margo_addr_str = rpc_lookup_remote_server_addr(rank); if (NULL == margo_addr_str) { - LOGERR("server index=%d - margo server lookup failed", rank); - ret = UNIFYFS_ERROR_KEYVAL; - return ret; + LOGERR("server index=%zu - margo server lookup failed", rank); + return (int)UNIFYFS_ERROR_KEYVAL; } - glb_servers[rank].margo_svr_addr_str = margo_addr_str; LOGDBG("server rank=%d, margo_addr=%s", rank, margo_addr_str); + server->margo_svr_addr_str = margo_addr_str; hg_return_t hret = margo_addr_lookup(unifyfsd_rpc_context->svr_mid, - glb_servers[rank].margo_svr_addr_str, - &(glb_servers[rank].margo_svr_addr)); + server->margo_svr_addr_str, + &(server->margo_svr_addr)); if (hret != HG_SUCCESS) { LOGERR("server index=%zu - margo_addr_lookup(%s) failed", rank, margo_addr_str); @@ -477,30 +486,43 @@ int margo_connect_server(int rank) /* margo_connect_servers * - * Using address strings found in glb_servers, resolve - * each peer server's margo address. + * Gather pmi rank and margo address string for all servers, + * and optionally connect to each one. 
*/ int margo_connect_servers(void) { int rc; - int ret = UNIFYFS_SUCCESS; - int i; - // block until a margo_svr key pair published by all servers + int ret = (int)UNIFYFS_SUCCESS; + + /* block until all servers have published their address */ rc = unifyfs_keyval_fence_remote(); if ((int)UNIFYFS_SUCCESS != rc) { LOGERR("keyval fence on margo_svr key failed"); - ret = UNIFYFS_ERROR_KEYVAL; - return ret; + return (int)UNIFYFS_ERROR_KEYVAL; } - for (i = 0; i < (int)glb_num_servers; i++) { - glb_servers[i].pmi_rank = i; - glb_servers[i].margo_svr_addr = HG_ADDR_NULL; - glb_servers[i].margo_svr_addr_str = NULL; + /* allocate array of structs to record address for each server */ + server_infos = (server_info_t*) calloc(glb_num_servers, + sizeof(server_info_t)); + if (NULL == server_infos) { + LOGERR("failed to allocate server_info array"); + return ENOMEM; + } + + /* lookup address string for each server, and optionally connect */ + size_t i; + for (i = 0; i < glb_num_servers; i++) { + /* record values on struct for this server */ + server_info_t* server = &server_infos[i]; + server->pmi_rank = i; + server->margo_svr_addr = HG_ADDR_NULL; + server->margo_svr_addr_str = NULL; + + /* connect to each server now if not using lazy connect */ if (!margo_lazy_connect) { rc = margo_connect_server(i); - if (rc != UNIFYFS_SUCCESS) { + if (UNIFYFS_SUCCESS != rc) { ret = rc; } } @@ -512,11 +534,12 @@ int margo_connect_servers(void) hg_addr_t get_margo_server_address(int rank) { assert(rank < glb_num_servers); - hg_addr_t addr = glb_servers[rank].margo_svr_addr; + server_info_t* server = &server_infos[rank]; + hg_addr_t addr = server->margo_svr_addr; if ((HG_ADDR_NULL == addr) && margo_lazy_connect) { int rc = margo_connect_server(rank); - if (rc == UNIFYFS_SUCCESS) { - addr = glb_servers[rank].margo_svr_addr; + if (UNIFYFS_SUCCESS == rc) { + addr = server->margo_svr_addr; } } return addr; diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index 068c1d3b5..e3abb081d 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -74,7 +74,6 @@ typedef struct { int pmi_rank; } server_info_t; -extern server_info_t* glb_servers; /* array of server info structs */ extern size_t glb_num_servers; /* number of entries in glb_servers array */ extern struct unifyfs_inode_tree* global_inode_tree; /* global inode tree */ diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 9519c5a8f..edbc63380 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -45,16 +45,11 @@ // margo rpcs #include "margo_server.h" -/* PMI information */ -int glb_pmi_rank; /* = 0 */ -int glb_pmi_size = 1; // for standalone server tests int server_pid; char glb_host[UNIFYFS_MAX_HOSTNAME]; -size_t glb_host_ndx; // index of localhost in glb_servers size_t glb_num_servers; // size of glb_servers array -server_info_t* glb_servers; // array of server_info_t unifyfs_cfg_t server_cfg; @@ -154,74 +149,32 @@ void exit_request(int sig) } } -#if defined(UNIFYFSD_USE_MPI) -static void init_MPI(int* argc, char*** argv) -{ - int rc, provided; - rc = MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_rank(MPI_COMM_WORLD, &glb_pmi_rank); - if (rc != MPI_SUCCESS) { - exit(1); - } - - rc = MPI_Comm_size(MPI_COMM_WORLD, &glb_pmi_size); - if (rc != MPI_SUCCESS) { - exit(1); - } -} - -static void fini_MPI(void) -{ - MPI_Finalize(); -} -#endif // UNIFYFSD_USE_MPI - -static int allocate_servers(size_t n_servers) -{ - 
glb_num_servers = n_servers; - glb_servers = (server_info_t*) calloc(n_servers, sizeof(server_info_t)); - if (NULL == glb_servers) { - LOGERR("failed to allocate server_info array"); - return ENOMEM; - } - return (int)UNIFYFS_SUCCESS; -} - static int process_servers_hostfile(const char* hostfile) { - int rc; - size_t i, cnt; - FILE* fp = NULL; - char hostbuf[UNIFYFS_MAX_HOSTNAME+1]; - if (NULL == hostfile) { return EINVAL; } - fp = fopen(hostfile, "r"); + + FILE* fp = fopen(hostfile, "r"); if (!fp) { LOGERR("failed to open hostfile %s", hostfile); return (int)UNIFYFS_FAILURE; } // scan first line: number of hosts - rc = fscanf(fp, "%zu\n", &cnt); + size_t cnt = 0; + int rc = fscanf(fp, "%zu\n", &cnt); if (1 != rc) { LOGERR("failed to scan hostfile host count"); fclose(fp); return (int)UNIFYFS_FAILURE; } - rc = allocate_servers(cnt); - if ((int)UNIFYFS_SUCCESS != rc) { - fclose(fp); - return (int)UNIFYFS_FAILURE; - } - // scan host lines + // scan host lines to find index of host of this process + size_t i; + size_t ndx = 0; for (i = 0; i < cnt; i++) { + char hostbuf[UNIFYFS_MAX_HOSTNAME + 1]; memset(hostbuf, 0, sizeof(hostbuf)); rc = fscanf(fp, "%s\n", hostbuf); if (1 != rc) { @@ -230,21 +183,60 @@ static int process_servers_hostfile(const char* hostfile) return (int)UNIFYFS_FAILURE; } + // check whether this line matches our hostname // NOTE: following assumes one server per host if (0 == strcmp(glb_host, hostbuf)) { - glb_host_ndx = (int)i; - LOGDBG("found myself at hostfile index=%zu, pmi_rank=%d", - glb_host_ndx, glb_pmi_rank); + ndx = (int)i; + LOGDBG("found myself at hostfile index=%zu", ndx); } } fclose(fp); - if (glb_pmi_size < cnt) { - glb_pmi_rank = (int)glb_host_ndx; - glb_pmi_size = (int)cnt; - LOGDBG("set pmi rank to host index %d", glb_pmi_rank); + glb_pmi_rank = (int)ndx; + glb_pmi_size = (int)cnt; + + LOGDBG("set pmi rank to host index %d", glb_pmi_rank); + + return (int)UNIFYFS_SUCCESS; +} + +/* Ensure that glb_pmi_rank, glb_pmi_size, and glb_num_server values are set. */ +static int get_server_rank_and_size(const unifyfs_cfg_t* cfg) +{ + int rc; + +#if defined(UNIFYFSD_USE_MPI) + /* use rank and size of MPI communicator */ + rc = MPI_Comm_rank(MPI_COMM_WORLD, &glb_pmi_rank); + if (rc != MPI_SUCCESS) { + exit(1); + } + + rc = MPI_Comm_size(MPI_COMM_WORLD, &glb_pmi_size); + if (rc != MPI_SUCCESS) { + exit(1); + } +#elif !defined(USE_PMIX) && !defined(USE_PMI2) + /* if not using PMIX or PMI2, + * initialize rank/size to assume a singleton job */ + glb_pmi_rank = 0; + glb_pmi_size = 1; +#endif + + /* If the user has specified a hostfile, + * extract glb_pmi_rank and glb_pmi_size from there + * overriding any settings from MPI/PMI. */ + if (NULL != cfg->server_hostfile) { + rc = process_servers_hostfile(cfg->server_hostfile); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to gather server information"); + exit(1); + } } + /* TODO: can we just use glb_pmi_size everywhere instead? 
*/ + glb_num_servers = glb_pmi_size; + return (int)UNIFYFS_SUCCESS; } @@ -287,7 +279,6 @@ int main(int argc, char* argv[]) int kv_rank, kv_nranks; bool daemon = true; struct sigaction sa; - char rank_str[16] = {0}; char dbg_fname[UNIFYFS_MAX_FILENAME] = {0}; rc = unifyfs_config_init(&server_cfg, argc, argv, 0, NULL); @@ -343,12 +334,10 @@ int main(int argc, char* argv[]) // initialize empty app_configs[] memset(app_configs, 0, sizeof(app_configs)); -#if defined(UNIFYFSD_USE_MPI) - init_MPI(&argc, &argv); -#endif + // record hostname of this server in global variable + gethostname(glb_host, sizeof(glb_host)); // start logging - gethostname(glb_host, sizeof(glb_host)); snprintf(dbg_fname, sizeof(dbg_fname), "%s/%s.%s", server_cfg.log_dir, server_cfg.log_file, glb_host); rc = unifyfs_log_open(dbg_fname); @@ -359,12 +348,40 @@ int main(int argc, char* argv[]) // print config unifyfs_config_print(&server_cfg, unifyfs_log_stream); - if (NULL != server_cfg.server_hostfile) { - rc = process_servers_hostfile(server_cfg.server_hostfile); - if (rc != (int)UNIFYFS_SUCCESS) { - LOGERR("failed to gather server information"); - exit(1); - } + // initialize MPI and PMI if we're using them +#if defined(UNIFYFSD_USE_MPI) + int provided; + rc = MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &provided); + if (rc != MPI_SUCCESS) { + LOGERR("failed to initialize MPI"); + exit(1); + } +#elif defined(USE_PMIX) + rc = unifyfs_pmix_init(); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to initialize PMIX"); + exit(1); + } +#elif defined(USE_PMI2) + rc = unifyfs_pmi2_init(); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to initialize PMI2"); + exit(1); + } +#endif + + /* get rank of this server process and number of servers, + * set glb_pmi_rank and glb_pmi_size */ + rc = get_server_rank_and_size(&server_cfg); + if (rc != (int)UNIFYFS_SUCCESS) { + LOGERR("failed to get server rank and size"); + exit(1); + } + + /* bail out if we don't have our server rank and group size defined */ + if (glb_pmi_size <= 0) { + LOGERR("failed to read rank and size of server group"); + exit(1); } kv_rank = glb_pmi_rank; @@ -384,17 +401,6 @@ int main(int argc, char* argv[]) glb_pmi_size = kv_nranks; } - snprintf(rank_str, sizeof(rank_str), "%d", glb_pmi_rank); - rc = unifyfs_keyval_publish_remote(key_unifyfsd_pmi_rank, rank_str); - if (rc != (int)UNIFYFS_SUCCESS) { - exit(1); - } - - if (NULL == server_cfg.server_hostfile) { - //glb_svr_rank = kv_rank; - rc = allocate_servers((size_t)kv_nranks); - } - LOGDBG("initializing rpc service"); rc = configurator_bool_val(server_cfg.margo_lazy_connect, &margo_lazy_connect); @@ -674,7 +680,7 @@ static int unifyfs_exit(void) #if defined(UNIFYFSD_USE_MPI) LOGDBG("finalizing MPI"); - fini_MPI(); + MPI_Finalize(); #endif LOGDBG("all done!"); From 7a7de94270dae53508682490d27f0d1e0fda5c2a Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Thu, 5 Aug 2021 09:35:20 -0400 Subject: [PATCH 36/81] add unifyfs_get_config() to library API Adds a programmatic way for library API clients to query the configuration settings for an initialized unifyfs_handle. The unit tests for the library API have been updated to exercise the new method. 
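For reference, a minimal usage sketch of the new call (illustrative only;
it mirrors the updated unit test, assumes fshdl is a unifyfs_handle
previously returned by unifyfs_initialize(), and the caller is responsible
for freeing the returned name/value strings and the options array):

    int n_opts = 0;
    unifyfs_cfg_option* options = NULL;
    unifyfs_rc rc = unifyfs_get_config(fshdl, &n_opts, &options);
    if (rc == UNIFYFS_SUCCESS) {
        for (int i = 0; i < n_opts; i++) {
            /* each option is a "section.key" name paired with its value */
            printf("%s = %s\n", options[i].opt_name, options[i].opt_value);
            free((void*)options[i].opt_name);
            free((void*)options[i].opt_value);
        }
        free(options);
    }

See the updated t/api/init-fini.c below for the full test.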
Also includes: * remove unused UNIFYFS_CFG_MULTI and UNIFYFS_CFG_MULTI_CLI macros from unifyfs_configurator.[ch] * turn off unifyfs.daemonize config setting by default, since it causes issues on many types of systems but doesn't really offer any benefit TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.c" TEST_CHECKPATCH_SKIP_FILES+=",common/src/unifyfs_configurator.h" --- client/src/unifyfs_api.c | 27 ++++ client/src/unifyfs_api.h | 13 ++ common/src/unifyfs_configurator.c | 255 ++++++++++-------------------- common/src/unifyfs_configurator.h | 17 +- docs/configuration.rst | 2 +- t/api/api_suite.c | 2 + t/api/api_suite.h | 4 + t/api/init-fini.c | 34 +++- 8 files changed, 162 insertions(+), 192 deletions(-) diff --git a/client/src/unifyfs_api.c b/client/src/unifyfs_api.c index bf234ae69..16a1c269c 100644 --- a/client/src/unifyfs_api.c +++ b/client/src/unifyfs_api.c @@ -297,3 +297,30 @@ unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl) return ret; } + +/* Retrieve client's UnifyFS configuration for the given handle. */ +unifyfs_rc unifyfs_get_config(unifyfs_handle fshdl, + int* n_opts, + unifyfs_cfg_option** options) +{ + if ((UNIFYFS_INVALID_HANDLE == fshdl) || + (NULL == n_opts) || + (NULL == options)) { + return EINVAL; + } + unifyfs_client* client = fshdl; + + int num_options; + unifyfs_cfg_option* options_array; + int ret = unifyfs_config_get_options(&(client->cfg), + &num_options, + &options_array); + if (UNIFYFS_SUCCESS == ret) { + *n_opts = num_options; + *options = options_array; + } else { + *n_opts = 0; + *options = NULL; + } + return ret; +} diff --git a/client/src/unifyfs_api.h b/client/src/unifyfs_api.h index b86cf78ab..45d955218 100644 --- a/client/src/unifyfs_api.h +++ b/client/src/unifyfs_api.h @@ -165,6 +165,19 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, */ unifyfs_rc unifyfs_finalize(unifyfs_handle fshdl); +/* + * Retrieve client's UnifyFS configuration for the given handle. + * + * @param[in] fshdl Client file system handle + * @param[out] n_opts pointer to size of options array + * @param[out] options pointer to array of configuration options + * + * @return UnifyFS success or failure code + */ +unifyfs_rc unifyfs_get_config(unifyfs_handle fshdl, + int* n_opts, + unifyfs_cfg_option** options); + /* * Create and open a new file in UnifyFS. 
* diff --git a/common/src/unifyfs_configurator.c b/common/src/unifyfs_configurator.c index 1aa6cd6a3..39d729da7 100644 --- a/common/src/unifyfs_configurator.c +++ b/common/src/unifyfs_configurator.c @@ -133,29 +133,9 @@ int unifyfs_config_fini(unifyfs_cfg_t* cfg) cfg->sec##_##key = NULL; \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - free(cfg->sec##_##key[u]); \ - cfg->sec##_##key[u] = NULL; \ - } \ - } \ - cfg->n_##sec##_##key = 0; - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - free(cfg->sec##_##key[u]); \ - cfg->sec##_##key[u] = NULL; \ - } \ - } \ - cfg->n_##sec##_##key = 0; - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI return (int)UNIFYFS_SUCCESS; } @@ -184,29 +164,9 @@ void unifyfs_config_print(unifyfs_cfg_t* cfg, fprintf(fp, "%s\n", msg); \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - snprintf(msg, sizeof(msg), "UNIFYFS CONFIG: %s.%s[%u] = %s", \ - #sec, #key, u+1, cfg->sec##_##key[u]); \ - fprintf(fp, "%s\n", msg); \ - } \ - } - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - snprintf(msg, sizeof(msg), "UNIFYFS CONFIG: %s.%s[%u] = %s", \ - #sec, #key, u+1, cfg->sec##_##key[u]); \ - fprintf(fp, "%s\n", msg); \ - } \ - } - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI fflush(fp); } @@ -239,35 +199,9 @@ void unifyfs_config_print_ini(unifyfs_cfg_t* cfg, last_sec = curr_sec; \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - curr_sec = #sec; \ - if ((last_sec == NULL) || (strcmp(curr_sec, last_sec) != 0)) \ - fprintf(inifp, "\n[%s]\n", curr_sec); \ - fprintf(inifp, "%s = %s ; (instance %u)\n", \ - #key, cfg->sec##_##key[u], u+1); \ - last_sec = curr_sec; \ - } \ - } - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - for (u = 0; u < me; u++) { \ - if (cfg->sec##_##key[u] != NULL) { \ - curr_sec = #sec; \ - if ((last_sec == NULL) || (strcmp(curr_sec, last_sec) != 0)) \ - fprintf(inifp, "\n[%s]\n", curr_sec); \ - fprintf(inifp, "%s = %s ; (instance %u)\n", \ - #key, cfg->sec##_##key[u], u+1); \ - last_sec = curr_sec; \ - } \ - } - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI fflush(inifp); } @@ -291,19 +225,9 @@ int unifyfs_config_set_defaults(unifyfs_cfg_t* cfg) if (0 != strcmp(val, "NULLSTRING")) \ cfg->sec##_##key = strdup(val); -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - cfg->n_##sec##_##key = 0; \ - memset((void *)cfg->sec##_##key, 0, sizeof(cfg->sec##_##key)); - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - cfg->n_##sec##_##key = 0; \ - memset((void *)cfg->sec##_##key, 0, sizeof(cfg->sec##_##key)); - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI return (int)UNIFYFS_SUCCESS; } @@ -320,17 +244,9 @@ void unifyfs_config_cli_usage(char* arg0) fprintf(stderr, " -%c,--%s-%s <%s>\t%s (default value: %s)\n", \ opt, #sec, 
#key, #typ, use, stringify(dv)); -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - fprintf(stderr, " -%c,--%s-%s <%s>\t%s (multiple values supported - max %u entries)\n", \ - opt, #sec, #key, #typ, use, me); - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI fflush(stderr); } @@ -351,14 +267,11 @@ static struct option cli_options[] = { #define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) #define UNIFYFS_CFG_CLI(sec, key, typ, dv, desc, vfn, opt, use) \ { #sec "-" #key, required_argument, NULL, opt }, -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - { #sec "-" #key, required_argument, NULL, opt }, + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI + { NULL, 0, NULL, 0 } }; @@ -397,25 +310,10 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t* cfg, short_opts[sndx++] = ':'; \ cli_options[ondx++].has_arg = required_argument; \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - short_opts[sndx++] = opt; \ - if (strcmp(#typ, "BOOL") == 0) { \ - short_opts[sndx++] = ':'; \ - short_opts[sndx++] = ':'; \ - cli_options[ondx++].has_arg = optional_argument; \ - } \ - else { \ - short_opts[sndx++] = ':'; \ - cli_options[ondx++].has_arg = required_argument; \ - } - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI //fprintf(stderr, "UNIFYFS CONFIG DEBUG: short-opts '%s'\n", short_opts); @@ -440,21 +338,10 @@ int unifyfs_config_process_cli_args(unifyfs_cfg_t* cfg, break; \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - case opt: { \ - if (cfg->sec##_##key[cfg->n_##sec##_##key] != NULL) \ - free(cfg->sec##_##key[cfg->n_##sec##_##key]; \ - cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(optarg); \ - break; \ - } - UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI + case ':': usage_err = 1; @@ -547,33 +434,9 @@ int unifyfs_config_process_environ(unifyfs_cfg_t* cfg) cfg->sec##_##key = strdup(envval); \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - for (u = 0; u < me; u++) { \ - envval = getenv_helper(#sec, #key, u+1); \ - if (envval != NULL) { \ - if (cfg->sec##_##key[u] != NULL) \ - free(cfg->sec##_##key[u]); \ - cfg->sec##_##key[u] = strdup(envval); \ - cfg->n_##sec##_##key++; \ - } \ - } - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - for (u = 0; u < me; u++) { \ - envval = getenv_helper(#sec, #key, u+1); \ - if (envval != NULL) { \ - if (cfg->sec##_##key[u] != NULL) \ - free(cfg->sec##_##key[u]); \ - cfg->sec##_##key[u] = strdup(envval); \ - cfg->n_##sec##_##key++; \ - } \ - } - - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI return (int)UNIFYFS_SUCCESS; } @@ -618,21 +481,10 @@ int inih_config_handler(void* user, } \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ - cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(val); \ - } - -#define 
UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ - cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(val); \ - } - -UNIFYFS_CONFIGS + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI + return 1; } @@ -751,21 +603,9 @@ int unifyfs_config_process_option(unifyfs_cfg_t* cfg, } \ } -#define UNIFYFS_CFG_MULTI(sec, key, typ, desc, vfn, me) \ - else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ - cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(opt_val); \ - } - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, desc, vfn, me, opt, use) \ - else if ((strcmp(section, #sec) == 0) && (strcmp(kee, #key) == 0)) { \ - cfg->sec##_##key[cfg->n_##sec##_##key++] = strdup(opt_val); \ - } - -UNIFYFS_CONFIGS + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI } @@ -778,7 +618,7 @@ int unifyfs_config_process_options(unifyfs_cfg_t* cfg, unifyfs_cfg_option* options) { if (nopt > 0) { - if (NULL == options) { + if ((NULL == cfg) || (NULL == options)) { return EINVAL; } for (int i = 0; i < nopt; i++) { @@ -794,6 +634,71 @@ int unifyfs_config_process_options(unifyfs_cfg_t* cfg, return UNIFYFS_SUCCESS; } +int unifyfs_config_get_options(unifyfs_cfg_t* cfg, + int* nopt, + unifyfs_cfg_option** options) +{ + if ((NULL == cfg) || (NULL == nopt) || (NULL == options)) { + return EINVAL; + } + + *nopt = 0; + *options = NULL; + + /* first, count the non-NULL settings */ + int num_set = 0; + +#define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ + if (cfg->sec##_##key != NULL) { \ + num_set++; \ + } + +#define UNIFYFS_CFG_CLI(sec, key, typ, dv, desc, vfn, opt, use) \ + if (cfg->sec##_##key != NULL) { \ + num_set++; \ + } + + UNIFYFS_CONFIGS +#undef UNIFYFS_CFG +#undef UNIFYFS_CFG_CLI + + /* now, allocate and fill the options array */ + unifyfs_cfg_option* opts = calloc(num_set, sizeof(unifyfs_cfg_option)); + if (NULL == opts) { + return ENOMEM; + } + + int opt_ndx = 0; + unifyfs_cfg_option* curr_opt; + char kee[256]; + +#define UNIFYFS_CFG(sec, key, typ, dv, desc, vfn) \ + if (cfg->sec##_##key != NULL) { \ + curr_opt = opts + opt_ndx; \ + opt_ndx++; \ + snprintf(kee, sizeof(kee), "%s.%s", #sec, #key); \ + curr_opt->opt_name = strdup(kee); \ + curr_opt->opt_value = strdup(cfg->sec##_##key); \ + } + +#define UNIFYFS_CFG_CLI(sec, key, typ, dv, desc, vfn, opt, use) \ + if (cfg->sec##_##key != NULL) { \ + curr_opt = opts + opt_ndx; \ + opt_ndx++; \ + snprintf(kee, sizeof(kee), "%s.%s", #sec, #key); \ + curr_opt->opt_name = strdup(kee); \ + curr_opt->opt_value = strdup(cfg->sec##_##key); \ + } + + UNIFYFS_CONFIGS +#undef UNIFYFS_CFG +#undef UNIFYFS_CFG_CLI + + *nopt = num_set; + *options = opts; + return UNIFYFS_SUCCESS; +} + /* predefined validation functions */ // utility routine to validate a single value given function @@ -888,7 +793,7 @@ int unifyfs_config_validate(unifyfs_cfg_t* cfg) } \ } - UNIFYFS_CONFIGS; + UNIFYFS_CONFIGS #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI #undef UNIFYFS_CFG_MULTI diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index aa7965091..e76accc3c 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -67,7 +67,7 @@ UNIFYFS_CFG_CLI(unifyfs, cleanup, BOOL, off, "cleanup storage on server exit", NULL, 'C', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, configfile, STRING, /etc/unifyfs.conf, "path to configuration file", 
configurator_file_check, 'f', "specify full path to config file") \ UNIFYFS_CFG_CLI(unifyfs, consistency, STRING, LAMINATED, "consistency model", NULL, 'c', "specify consistency model (NONE | LAMINATED | POSIX)") \ - UNIFYFS_CFG_CLI(unifyfs, daemonize, BOOL, on, "enable server daemonization", NULL, 'D', "on|off") \ + UNIFYFS_CFG_CLI(unifyfs, daemonize, BOOL, off, "enable server daemonization", NULL, 'D', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ UNIFYFS_CFG(client, fsync_persist, BOOL, on, "persist written data to storage on fsync()", NULL) \ @@ -119,20 +119,10 @@ typedef struct { #define UNIFYFS_CFG_CLI(sec, key, typ, dv, desc, vfn, opt, use) \ char *sec##_##key; -#define UNIFYFS_CFG_MULTI(sec, key, typ, dv, desc, vfn, me) \ - char *sec##_##key[me]; \ - unsigned n_##sec##_##key; - -#define UNIFYFS_CFG_MULTI_CLI(sec, key, typ, dv, desc, vfn, me, opt, use) \ - char *sec##_##key[me]; \ - unsigned n_##sec##_##key; - UNIFYFS_CONFIGS - #undef UNIFYFS_CFG #undef UNIFYFS_CFG_CLI -#undef UNIFYFS_CFG_MULTI -#undef UNIFYFS_CFG_MULTI_CLI + } unifyfs_cfg_t; /* initialization and cleanup */ @@ -177,6 +167,9 @@ int unifyfs_config_process_options(unifyfs_cfg_t* cfg, int nopt, unifyfs_cfg_option* options); +int unifyfs_config_get_options(unifyfs_cfg_t* cfg, + int* nopt, + unifyfs_cfg_option** options); int unifyfs_config_validate(unifyfs_cfg_t* cfg); diff --git a/docs/configuration.rst b/docs/configuration.rst index 675bf6291..a815c3d56 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -56,7 +56,7 @@ a given section and key. cleanup BOOL cleanup storage on server exit (default: off) configfile STRING path to custom configuration file consistency STRING consistency model [ LAMINATED | POSIX | NONE ] - daemonize BOOL enable server daemonization (default: on) + daemonize BOOL enable server daemonization (default: off) mountpoint STRING mountpoint path prefix (default: /unifyfs) ============= ====== =============================================== diff --git a/t/api/api_suite.c b/t/api/api_suite.c index 9284613c8..09e5d046e 100644 --- a/t/api/api_suite.c +++ b/t/api/api_suite.c @@ -57,6 +57,8 @@ int main(int argc, char* argv[]) rc = api_initialize_test(unifyfs_root, &fshdl); if (rc == UNIFYFS_SUCCESS) { + api_config_test(unifyfs_root, &fshdl); + api_create_open_remove_test(unifyfs_root, &fshdl); api_write_read_sync_stat_test(unifyfs_root, &fshdl, diff --git a/t/api/api_suite.h b/t/api/api_suite.h index 48d59abca..d1e571df6 100644 --- a/t/api/api_suite.h +++ b/t/api/api_suite.h @@ -35,6 +35,10 @@ int api_initialize_test(char* unifyfs_root, unifyfs_handle* fshdl); +/* Tests API get-configuration */ +int api_config_test(char* unifyfs_root, + unifyfs_handle* fshdl); + /* Tests API finalization */ int api_finalize_test(char* unifyfs_root, unifyfs_handle* fshdl); diff --git a/t/api/init-fini.c b/t/api/init-fini.c index ea4602559..317ceb241 100644 --- a/t/api/init-fini.c +++ b/t/api/init-fini.c @@ -17,7 +17,7 @@ int api_initialize_test(char* unifyfs_root, unifyfs_handle* fshdl) { - diag("Starting API initialization tests"); + diag("Starting API initialization test"); int n_configs = 1; unifyfs_cfg_option chk_size = { .opt_name = "logio.chunk_size", @@ -28,20 +28,46 @@ int api_initialize_test(char* unifyfs_root, "%s:%d unifyfs_initialize() is successful: rc=%d (%s)", __FILE__, __LINE__, rc, 
unifyfs_rc_enum_description(rc)); - diag("Finished API initialization tests"); + diag("Finished API initialization test"); + return rc; +} + +int api_config_test(char* unifyfs_root, + unifyfs_handle* fshdl) +{ + diag("Starting API get-configuration test"); + + int n_opt; + unifyfs_cfg_option* options; + int rc = unifyfs_get_config(*fshdl, &n_opt, &options); + ok(rc == UNIFYFS_SUCCESS && NULL != options, + "%s:%d unifyfs_get_config() is successful: rc=%d (%s)", + __FILE__, __LINE__, rc, unifyfs_rc_enum_description(rc)); + + if (NULL != options) { + for (int i = 0; i < n_opt; i++) { + unifyfs_cfg_option* opt = options + i; + diag("UNIFYFS CONFIG: %s = %s", opt->opt_name, opt->opt_value); + free((void*)opt->opt_name); + free((void*)opt->opt_value); + } + free(options); + } + + diag("Finished API get-configuration test"); return rc; } int api_finalize_test(char* unifyfs_root, unifyfs_handle* fshdl) { - diag("Starting API finalization tests"); + diag("Starting API finalization test"); int rc = unifyfs_finalize(*fshdl); ok(rc == UNIFYFS_SUCCESS, "%s:%d unifyfs_finalize() is successful: rc=%d (%s)", __FILE__, __LINE__, rc, unifyfs_rc_enum_description(rc)); - diag("Finished API finalization tests"); + diag("Finished API finalization test"); return rc; } From 85a7f9b26944a967391bf655e8a26c8c3c7f9769 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Tue, 3 Aug 2021 08:32:18 -0400 Subject: [PATCH 37/81] add client unlink callback This callback informs clients that they should cleanup any associated file state, including releasing storage for unsynced extents. The callback rpc is only sent to clients that have previously registered extents with their local server. Also: * adds an integer flag to the logio header to avoid concurrent updates by the client and server. 
* adds a program option to examples to unlink target file * adds a new arraylist_sort() method --- client/src/margo_client.c | 52 +++++++ client/src/margo_client.h | 1 + client/src/unifyfs_api_file.c | 13 +- client/src/unifyfs_api_internal.h | 3 + client/src/unifyfs_fid.c | 34 +++-- common/src/arraylist.c | 75 +++++++--- common/src/arraylist.h | 40 +++++- common/src/unifyfs_client_rpcs.h | 19 +++ common/src/unifyfs_logio.c | 40 ++++++ examples/src/checkpoint-restart.c | 9 ++ examples/src/testutil.h | 59 +++++--- examples/src/write-transfer.c | 9 ++ examples/src/write.c | 9 ++ examples/src/writeread.c | 9 ++ server/src/margo_server.c | 71 +++++++++- server/src/margo_server.h | 6 + server/src/unifyfs_global.h | 3 - server/src/unifyfs_inode.c | 60 ++++++++ server/src/unifyfs_request_manager.c | 200 ++++++++++++++++++++++++++- server/src/unifyfs_request_manager.h | 19 +++ server/src/unifyfs_server.c | 3 + 21 files changed, 663 insertions(+), 71 deletions(-) diff --git a/client/src/margo_client.c b/client/src/margo_client.c index 3878c9cdc..714b2640d 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -74,6 +74,7 @@ static void register_client_rpcs(client_rpc_context_t* ctx) CLIENT_REGISTER_RPC_HANDLER(mread_req_data); CLIENT_REGISTER_RPC_HANDLER(mread_req_complete); CLIENT_REGISTER_RPC_HANDLER(transfer_complete); + CLIENT_REGISTER_RPC_HANDLER(unlink_callback); #undef CLIENT_REGISTER_RPC_HANDLER } @@ -1122,3 +1123,54 @@ static void unifyfs_transfer_complete_rpc(hg_handle_t handle) margo_destroy(handle); } DEFINE_MARGO_RPC_HANDLER(unifyfs_transfer_complete_rpc) + +/* unlink callback rpc */ +static void unifyfs_unlink_callback_rpc(hg_handle_t handle) +{ + int ret; + + /* get input params */ + unifyfs_unlink_callback_in_t in; + hg_return_t hret = margo_get_input(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_input() failed"); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* lookup client */ + unifyfs_client* client; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; + client = unifyfs_find_client(client_app, client_id, NULL); + if (NULL == client) { + /* unknown client */ + ret = EINVAL; + } else { + int gfid = (int) in.gfid; + int fid = unifyfs_fid_from_gfid(client, gfid); + if (-1 != fid) { + unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, + fid); + if ((meta != NULL) && (fid == meta->fid)) { + meta->pending_unlink = 1; + } + } + ret = UNIFYFS_SUCCESS; + } + margo_free_input(handle, &in); + } + + /* set rpc result status */ + unifyfs_unlink_callback_out_t out; + out.ret = ret; + + /* return to caller */ + LOGDBG("responding"); + hret = margo_respond(handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_respond() failed"); + } + + /* free margo resources */ + margo_destroy(handle); +} +DEFINE_MARGO_RPC_HANDLER(unifyfs_unlink_callback_rpc) diff --git a/client/src/margo_client.h b/client/src/margo_client.h index 0cfa27b3a..8964f6149 100644 --- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -43,6 +43,7 @@ typedef struct ClientRpcIds { hg_id_t mread_req_data_id; hg_id_t mread_req_complete_id; hg_id_t transfer_complete_id; + hg_id_t unlink_callback_id; } client_rpcs_t; typedef struct ClientRpcContext { diff --git a/client/src/unifyfs_api_file.c b/client/src/unifyfs_api_file.c index ada064b05..e9bdadf07 100644 --- a/client/src/unifyfs_api_file.c +++ b/client/src/unifyfs_api_file.c @@ -239,13 +239,8 @@ unifyfs_rc unifyfs_remove(unifyfs_handle fshdl, } unifyfs_rc ret = UNIFYFS_SUCCESS; - - /* invoke unlink rpc */ 
int gfid = unifyfs_generate_gfid(filepath); - int rc = invoke_client_unlink_rpc(client, gfid); - if (rc != UNIFYFS_SUCCESS) { - ret = rc; - } + int rc; /* clean up the local state for this file (if any) */ int fid = unifyfs_fid_from_gfid(client, gfid); @@ -259,5 +254,11 @@ unifyfs_rc unifyfs_remove(unifyfs_handle fshdl, } } + /* invoke unlink rpc */ + rc = invoke_client_unlink_rpc(client, gfid); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; + } + return ret; } diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h index 086745a34..934531f9d 100644 --- a/client/src/unifyfs_api_internal.h +++ b/client/src/unifyfs_api_internal.h @@ -34,9 +34,12 @@ typedef struct { int fid; /* local file index in filemetas array */ int storage; /* FILE_STORAGE type */ + int pending_unlink; /* received unlink callback */ + int needs_sync; /* have unsynced writes */ struct seg_tree extents_sync; /* Segment tree containing our coalesced * writes between sync operations */ + struct seg_tree extents; /* Segment tree of all local data extents */ unifyfs_file_attr_t attrs; /* UnifyFS and POSIX file attributes */ diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index 8822f4ca1..fbbd98a14 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -90,7 +90,7 @@ static int fid_storage_free(unifyfs_client* client, { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); - if ((meta != NULL) && (meta->fid == fid)) { + if ((meta != NULL) && (fid == meta->fid)) { if (meta->storage == FILE_STORAGE_LOGIO) { /* client needs to release unsynced write extents, since server * does not know about them */ @@ -122,6 +122,8 @@ static int fid_storage_free(unifyfs_client* client, /* set storage type back to NULL */ meta->storage = FILE_STORAGE_NULL; + meta->fid = -1; + return UNIFYFS_SUCCESS; } @@ -240,9 +242,10 @@ int unifyfs_fid_create_file(unifyfs_client* client, meta->attrs.ctime = tp; /* set UnifyFS client metadata */ - meta->fid = fid; - meta->storage = FILE_STORAGE_NULL; - meta->needs_sync = 0; + meta->fid = fid; + meta->storage = FILE_STORAGE_NULL; + meta->needs_sync = 0; + meta->pending_unlink = 0; return fid; } @@ -611,12 +614,6 @@ int unifyfs_fid_unlink(unifyfs_client* client, /* invoke unlink rpc */ int gfid = unifyfs_gfid_from_fid(client, fid); - rc = invoke_client_unlink_rpc(client, gfid); - if (rc != UNIFYFS_SUCCESS) { - /* TODO: if item does not exist globally, but just locally, - * we still want to delete item locally */ - return rc; - } /* finalize the storage we're using for this file */ rc = unifyfs_fid_delete(client, fid); @@ -627,6 +624,11 @@ int unifyfs_fid_unlink(unifyfs_client* client, return rc; } + rc = invoke_client_unlink_rpc(client, gfid); + if (rc != UNIFYFS_SUCCESS) { + return rc; + } + return UNIFYFS_SUCCESS; } @@ -644,6 +646,18 @@ unifyfs_filemeta_t* unifyfs_get_meta_from_fid(unifyfs_client* client, if (fid >= 0 && fid < client->max_files) { /* get a pointer to the file meta data structure */ unifyfs_filemeta_t* meta = &(client->unifyfs_filemetas[fid]); + + if (fid == meta->fid) { + /* before returning metadata, process any pending callbacks */ + if (meta->pending_unlink) { + LOGDBG("processing pending global unlink"); + meta->pending_unlink = 0; + int rc = unifyfs_fid_delete(client, fid); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("fid delete failed"); + } + } + } return meta; } return NULL; diff --git a/common/src/arraylist.c b/common/src/arraylist.c index f9f6f5383..00c902194 100644 --- 
a/common/src/arraylist.c +++ b/common/src/arraylist.c @@ -32,6 +32,9 @@ #include #include +/* Create an arraylist with the given capacity. + * If capacity == 0, use the default ARRAYLIST_CAPACITY. + * Returns the new arraylist, or NULL on error. */ arraylist_t* arraylist_create(int capacity) { arraylist_t* arr = (arraylist_t*) malloc(sizeof(arraylist_t)); @@ -54,6 +57,7 @@ arraylist_t* arraylist_create(int capacity) return arr; } +/* Returns the arraylist capacity in elements, or -1 on error */ int arraylist_capacity(arraylist_t* arr) { if (NULL == arr) { @@ -62,6 +66,7 @@ int arraylist_capacity(arraylist_t* arr) return arr->cap; } +/* Returns the current arraylist size in elements, or -1 on error */ int arraylist_size(arraylist_t* arr) { if (NULL == arr) { @@ -70,6 +75,40 @@ int arraylist_size(arraylist_t* arr) return arr->size; } +/* Reset the arraylist size to zero */ +int arraylist_reset(arraylist_t* arr) +{ + if (NULL == arr) { + return -1; + } + + arr->size = 0; + + return 0; +} + +/* Free all arraylist elements, the array storage, and the arraylist_t */ +int arraylist_free(arraylist_t* arr) +{ + if (NULL == arr) { + return -1; + } + + if (NULL != arr->elems) { + for (int i = 0; i < arr->cap; i++) { + if (arr->elems[i] != NULL) { + free(arr->elems[i]); + } + } + free(arr->elems); + } + + free(arr); + + return 0; +} + +/* Get the element at given position */ void* arraylist_get(arraylist_t* arr, int pos) { if ((NULL == arr) || (pos >= arr->size)) { @@ -78,6 +117,7 @@ void* arraylist_get(arraylist_t* arr, int pos) return arr->elems[pos]; } +/* Remove the element at given list index and return it */ void* arraylist_remove(arraylist_t* arr, int pos) { void* item = arraylist_get(arr, pos); @@ -99,7 +139,7 @@ void* arraylist_remove(arraylist_t* arr, int pos) return item; } -/* Inserts element at given index (pos) in the arraylist. +/* Insert the element at the given list index (pos) in the arraylist. * Overwrites (and frees) any existing element at that index. * Returns 0 on success, or -1 on error */ int arraylist_insert(arraylist_t* arr, int pos, void* elem) @@ -153,33 +193,28 @@ int arraylist_add(arraylist_t* arr, void* elem) } } -int arraylist_reset(arraylist_t* arr) +/* Sort the arraylist elements using the given comparison function (cmpfn). + * Note that the comparison function should properly handle NULL pointer + * elements of the array. + * Return 0 on success, -1 on error */ +int arraylist_sort(arraylist_t* arr, + int (*cmpfn)(const void *, const void *)) { if (NULL == arr) { return -1; } - arr->size = 0; - - return 0; -} + /* sort using provided comparison function */ + qsort(arr->elems, arr->cap, sizeof(void*), cmpfn); -int arraylist_free(arraylist_t* arr) -{ - if (NULL == arr) { - return -1; - } - - if (NULL != arr->elems) { - for (int i = 0; i < arr->cap; i++) { - if (arr->elems[i] != NULL) { - free(arr->elems[i]); - } + /* adjust size to match last used index */ + int last_used_pos = -1; + for (int i = 0; i < arr->cap; i++) { + if (arr->elems[i] != NULL) { + last_used_pos = i; } - free(arr->elems); } - - free(arr); + arr->size = last_used_pos + 1; return 0; } diff --git a/common/src/arraylist.h b/common/src/arraylist.h index e129bdbfe..2f5545c2e 100644 --- a/common/src/arraylist.h +++ b/common/src/arraylist.h @@ -39,14 +39,44 @@ typedef struct { void** elems; } arraylist_t; +/* Create an arraylist with the given capacity. + * If capacity == 0, use the default ARRAYLIST_CAPACITY. + * Returns the new arraylist, or NULL on error. 
*/ arraylist_t* arraylist_create(int capacity); -int arraylist_add(arraylist_t* arr, void* elem); + +/* Returns the arraylist capacity in elements, or -1 on error */ +int arraylist_capacity(arraylist_t* arr); + +/* Returns the current arraylist size in elements, or -1 on error */ +int arraylist_size(arraylist_t* arr); + +/* Reset the arraylist size to zero */ int arraylist_reset(arraylist_t* arr); -int arraylist_free(arraylist_t* arr); -int arraylist_insert(arraylist_t* arr, int pos, void* elem); + +/* Get the element at the given list index (pos)) */ void* arraylist_get(arraylist_t* arr, int pos); + +/* Remove the element at given list index (pos) and return it */ void* arraylist_remove(arraylist_t* arr, int pos); -int arraylist_capacity(arraylist_t* arr); -int arraylist_size(arraylist_t* arr); + +/* Free all arraylist elements, the array storage, and the arraylist_t. + * Returns 0 on success, -1 on error */ +int arraylist_free(arraylist_t* arr); + +/* Adds element to the end of the current list. + * Returns list index of newly added element, or -1 on error */ +int arraylist_add(arraylist_t* arr, void* elem); + +/* Insert the element at the given list index (pos) in the arraylist. + * Overwrites (and frees) any existing element at that index. + * Returns 0 on success, or -1 on error */ +int arraylist_insert(arraylist_t* arr, int pos, void* elem); + +/* Sort the arraylist elements using the given comparison function (cmpfn). + * Note that the comparison function should properly handle NULL pointer + * elements of the array. + * Return 0 on success, -1 on error */ +int arraylist_sort(arraylist_t* arr, + int (*cmpfn)(const void *, const void *)); #endif diff --git a/common/src/unifyfs_client_rpcs.h b/common/src/unifyfs_client_rpcs.h index d13729631..4cf731749 100644 --- a/common/src/unifyfs_client_rpcs.h +++ b/common/src/unifyfs_client_rpcs.h @@ -47,6 +47,13 @@ typedef enum { UNIFYFS_CLIENT_RPC_UNMOUNT } client_rpc_e; +typedef enum { + UNIFYFS_CLIENT_CALLBACK_INVALID = 0, + UNIFYFS_CLIENT_CALLBACK_LAMINATE, + UNIFYFS_CLIENT_CALLBACK_TRUNCATE, + UNIFYFS_CLIENT_CALLBACK_UNLINK +} client_callback_e; + /* unifyfs_attach_rpc (client => server) * * initialize server access to client's shared memory and file state */ @@ -189,6 +196,18 @@ MERCURY_GEN_PROC(unifyfs_unlink_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_unlink_rpc) +/* unifyfs_unlink_callback_rpc (server => client) + * + * given an app_id, client_id, and global file id, + * free the client metadata and data associated with the file */ +MERCURY_GEN_PROC(unifyfs_unlink_callback_in_t, + ((int32_t)(app_id)) + ((int32_t)(client_id)) + ((int32_t)(gfid))) +MERCURY_GEN_PROC(unifyfs_unlink_callback_out_t, + ((int32_t)(ret))) +DECLARE_MARGO_RPC_HANDLER(unifyfs_unlink_callback_rpc) + /* unifyfs_laminate_rpc (client => server) * * given an app_id, client_id, and global file id, diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index 1ce9663d5..ed752adbd 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -40,10 +40,28 @@ typedef struct log_header { size_t reserved_sz; /* reserved data bytes */ size_t chunk_sz; /* data chunk size */ off_t data_offset; /* file/memory offset where data chunks start */ + + volatile int updating; /* flag to prevent client/server update races */ } log_header; /* chunk slot_map immediately follows header and occupies rest of the page */ // slot_map chunk_map; /* chunk slot_map that tracks reservations */ +inline void LOCK_LOG_HEADER(log_header* hdr) +{ + assert(NULL 
!= hdr); + while (hdr->updating) { + usleep(10); + } + hdr->updating = 1; +} + +inline void UNLOCK_LOG_HEADER(log_header* hdr) +{ + assert(NULL != hdr); + assert(hdr->updating); + hdr->updating = 0; +} + static inline slot_map* log_header_to_chunkmap(log_header* hdr) { @@ -548,6 +566,7 @@ int unifyfs_logio_alloc(logio_context* ctx, if (NULL != ctx->shmem) { /* get shmem log header and chunk slotmap */ shmem_hdr = (log_header*) ctx->shmem->addr; + LOCK_LOG_HEADER(shmem_hdr); chunkmap = log_header_to_chunkmap(shmem_hdr); /* calculate number of chunks needed for requested bytes */ @@ -561,6 +580,7 @@ int unifyfs_logio_alloc(logio_context* ctx, /* success, all needed chunks allocated in shmem */ allocated_bytes = res_chunks * chunk_sz; shmem_hdr->reserved_sz += allocated_bytes; + UNLOCK_LOG_HEADER(shmem_hdr); res_off = (off_t)(res_slot * chunk_sz); *log_offset = res_off; return UNIFYFS_SUCCESS; @@ -583,12 +603,15 @@ int unifyfs_logio_alloc(logio_context* ctx, mem_res_nchk = res_chunks; mem_res_at_end = 1; } + } else { + UNLOCK_LOG_HEADER(shmem_hdr); } } if (NULL != ctx->spill_hdr) { /* get spill log header and chunk slotmap */ spill_hdr = (log_header*) ctx->spill_hdr; + LOCK_LOG_HEADER(spill_hdr); chunkmap = log_header_to_chunkmap(spill_hdr); /* calculate number of chunks needed for remaining bytes */ @@ -603,6 +626,7 @@ int unifyfs_logio_alloc(logio_context* ctx, if (0 == mem_res_at_end) { /* success, full reservation in spill */ spill_hdr->reserved_sz += allocated_bytes; + UNLOCK_LOG_HEADER(spill_hdr); res_off = (off_t)(res_slot * chunk_sz); if (NULL != shmem_hdr) { /* update log offset to account for shmem log size */ @@ -629,6 +653,7 @@ int unifyfs_logio_alloc(logio_context* ctx, if (rc != UNIFYFS_SUCCESS) { LOGERR("slotmap_release() for logio shmem failed"); } + UNLOCK_LOG_HEADER(shmem_hdr); mem_res_slot = 0; mem_res_nchk = 0; mem_allocation = 0; @@ -642,6 +667,7 @@ int unifyfs_logio_alloc(logio_context* ctx, /* success, full reservation in spill */ allocated_bytes = res_chunks * chunk_sz; spill_hdr->reserved_sz += allocated_bytes; + UNLOCK_LOG_HEADER(spill_hdr); res_off = (off_t)(res_slot * chunk_sz); if (NULL != shmem_hdr) { /* update log offset to include shmem log size */ @@ -653,11 +679,15 @@ int unifyfs_logio_alloc(logio_context* ctx, } else { /* successful reservation spanning shmem and spill */ shmem_hdr->reserved_sz += mem_allocation; + UNLOCK_LOG_HEADER(shmem_hdr); spill_hdr->reserved_sz += allocated_bytes; + UNLOCK_LOG_HEADER(spill_hdr); *log_offset = res_off; return UNIFYFS_SUCCESS; } } + } else { + UNLOCK_LOG_HEADER(spill_hdr); } } @@ -669,6 +699,7 @@ int unifyfs_logio_alloc(logio_context* ctx, if (rc != UNIFYFS_SUCCESS) { LOGERR("slotmap_release() for logio shmem failed"); } + UNLOCK_LOG_HEADER(shmem_hdr); } LOGDBG("returning ENOSPC"); return ENOSPC; @@ -699,6 +730,7 @@ int unifyfs_logio_free(logio_context* ctx, } /* determine chunk allocations based on log offset */ + size_t released_bytes; size_t sz_in_mem = 0; size_t sz_in_spill = 0; off_t spill_offset = 0; @@ -711,26 +743,34 @@ int unifyfs_logio_free(logio_context* ctx, size_t chunk_sz, chunk_slot, num_chunks; if (sz_in_mem > 0) { /* release shared memory chunks */ + LOCK_LOG_HEADER(shmem_hdr); chunk_sz = shmem_hdr->chunk_sz; chunk_slot = log_offset / chunk_sz; num_chunks = bytes_to_chunks(sz_in_mem, chunk_sz); + released_bytes = chunk_sz * num_chunks; chunkmap = log_header_to_chunkmap(shmem_hdr); rc = slotmap_release(chunkmap, chunk_slot, num_chunks); if (rc != UNIFYFS_SUCCESS) { LOGERR("slotmap_release() for logio 
shmem failed"); } + shmem_hdr->reserved_sz -= released_bytes; + UNLOCK_LOG_HEADER(shmem_hdr); } if (sz_in_spill > 0) { /* release spill chunks */ spill_hdr = (log_header*) ctx->spill_hdr; + LOCK_LOG_HEADER(spill_hdr); chunk_sz = spill_hdr->chunk_sz; chunk_slot = spill_offset / chunk_sz; num_chunks = bytes_to_chunks(sz_in_spill, chunk_sz); + released_bytes = chunk_sz * num_chunks; chunkmap = log_header_to_chunkmap(spill_hdr); rc = slotmap_release(chunkmap, chunk_slot, num_chunks); if (rc != UNIFYFS_SUCCESS) { LOGERR("slotmap_release() for logio spill failed"); } + spill_hdr->reserved_sz -= released_bytes; + UNLOCK_LOG_HEADER(spill_hdr); } return rc; } diff --git a/examples/src/checkpoint-restart.c b/examples/src/checkpoint-restart.c index be908b6b7..c8ab84542 100644 --- a/examples/src/checkpoint-restart.c +++ b/examples/src/checkpoint-restart.c @@ -358,6 +358,15 @@ int main(int argc, char* argv[]) test_print(cfg, "ERROR - Restart data verification failed!"); } + if (cfg->remove_target) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + // post-restart cleanup free(restart_data); free(req); diff --git a/examples/src/testutil.h b/examples/src/testutil.h index ad1290108..3c9c81438 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -117,29 +117,30 @@ const char* io_pattern_str(int pattern) typedef struct { /* program behavior options */ - int debug; /* during startup, wait for input at rank 0 */ - int verbose; /* print verbose information to stderr */ + int debug; /* during startup, wait for input at rank 0 */ + int verbose; /* print verbose information to stderr */ int use_mpi; int use_unifyfs; int enable_mpi_mount; /* automount during MPI_Init() */ char* output_file; /* print test messages to output file */ FILE* output_fp; - int reuse_filename; /* remove and then reuse filename from prior run*/ + int reuse_filename; /* remove and then reuse filename from prior run */ + int remove_target; /* remove the target file */ /* I/O behavior options */ - int io_pattern; /* N1 or NN */ - int io_check; /* use lipsum to verify data */ - int io_shuffle; /* read and write different extents */ + int io_pattern; /* N1 or NN */ + int io_check; /* use lipsum to verify data */ + int io_shuffle; /* read and write different extents */ int pre_wr_trunc; /* truncate file before writing */ int post_wr_trunc; /* truncate file after writing */ - int use_aio; /* use asynchronous IO */ - int use_api; /* use UnifyFS library API */ - int use_lio; /* use lio_listio instead of read/write */ - int use_mapio; /* use mmap instead of read/write */ - int use_mpiio; /* use MPI-IO instead of POSIX I/O */ - int use_prdwr; /* use pread/pwrite instead of read/write */ - int use_stdio; /* use fread/fwrite instead of read/write */ - int use_vecio; /* use readv/writev instead of read/write */ + int use_aio; /* use asynchronous IO */ + int use_api; /* use UnifyFS library API */ + int use_lio; /* use lio_listio instead of read/write */ + int use_mapio; /* use mmap instead of read/write */ + int use_mpiio; /* use MPI-IO instead of POSIX I/O */ + int use_prdwr; /* use pread/pwrite instead of read/write */ + int use_stdio; /* use fread/fwrite instead of read/write */ + int use_vecio; /* use readv/writev instead of read/write */ /* I/O size options */ uint64_t n_blocks; /* number of I/O blocks */ @@ -153,6 +154,7 @@ typedef struct { FILE* fp; int fd; int fd_access; 
/* access flags for cfg.fd */ + int is_open; /* flag to indicate if file is currently open */ void* mapped; /* address of mapped extent of cfg.fd */ off_t mapped_off; /* start offset for mapped extent */ size_t mapped_sz; /* size of mapped extent */ @@ -231,6 +233,7 @@ void test_config_print(test_cfg* cfg) fprintf(fp, "\t mpi_mount = %d\n", cfg->enable_mpi_mount); fprintf(fp, "\t outfile = %s\n", cfg->output_file); fprintf(fp, "\t reuse_fname = %d\n", cfg->reuse_filename); + fprintf(fp, "\t unlink = %d\n", cfg->remove_target); fprintf(fp, "\n-- IO Behavior --\n"); fprintf(fp, "\t io_pattern = %s\n", io_pattern_str(cfg->io_pattern)); @@ -520,7 +523,7 @@ int test_is_static(const char* program) // common options for all tests -static const char* test_short_opts = "Ab:c:dD:f:hklLm:Mn:No:p:PrSt:T:UvVx"; +static const char* test_short_opts = "Ab:c:dD:f:hklLm:Mn:No:p:PrSt:T:uUvVx"; static const struct option test_long_opts[] = { { "aio", 0, 0, 'A' }, @@ -544,6 +547,7 @@ static const struct option test_long_opts[] = { { "stdio", 0, 0, 'S' }, { "pre-truncate", 1, 0, 't' }, { "post-truncate", 1, 0, 'T' }, + { "unlink", 0, 0, 'u' }, { "disable-unifyfs", 0, 0, 'U' }, { "verbose", 0, 0, 'v' }, { "vecio", 0, 0, 'V' }, @@ -596,6 +600,8 @@ static const char* test_usage_str = " (default: off)\n" " -T, --post-truncate= truncate file to size (B) after writing\n" " (default: off)\n" + " -u, --unlink unlink target file\n" + " (default: off)\n" " -U, --disable-unifyfs do not use UnifyFS\n" " (default: enable UnifyFS)\n" " -v, --verbose print verbose information\n" @@ -709,6 +715,10 @@ int test_process_argv(test_cfg* cfg, cfg->trunc_size = (off_t) strtoul(optarg, NULL, 0); break; + case 'u': + cfg->remove_target = 1; + break; + case 'U': cfg->use_unifyfs = 0; break; @@ -1120,6 +1130,7 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) cfg->fd = fd; cfg->fd_access = access; } + cfg->is_open = 1; return 0; } @@ -1169,6 +1180,10 @@ int test_close_file(test_cfg* cfg) { assert(NULL != cfg); + if (!cfg->is_open) { + return 0; + } + if (cfg->use_api) { #ifndef DISABLE_UNIFYFS cfg->gfid = UNIFYFS_INVALID_GFID; @@ -1195,6 +1210,7 @@ int test_close_file(test_cfg* cfg) cfg->fd = -1; } + cfg->is_open = 0; return 0; } @@ -1230,6 +1246,9 @@ int test_remove_file(test_cfg* cfg, const char* filepath) assert(NULL != cfg); + /* close the file if it's still open */ + test_close_file(cfg); + /* stat file and simply return if it already doesn't exist */ rc = stat(filepath, &sb); if (rc) { @@ -1239,10 +1258,10 @@ int test_remove_file(test_cfg* cfg, const char* filepath) } if (cfg->use_mpiio) { - MPI_CHECK(cfg, (MPI_File_delete(filepath, MPI_INFO_NULL))); - if (mpi_error) { - return -1; + if (cfg->rank == 0 || cfg->io_pattern == IO_PATTERN_NN) { + MPI_CHECK(cfg, (MPI_File_delete(filepath, MPI_INFO_NULL))); } + test_barrier(cfg); return 0; } @@ -1272,6 +1291,7 @@ int test_remove_file(test_cfg* cfg, const char* filepath) } } } + test_barrier(cfg); return 0; } @@ -1304,6 +1324,7 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) if (mpi_error) { return -1; } + cfg->is_open = 1; return 0; } @@ -1340,6 +1361,7 @@ int test_create_file(test_cfg* cfg, const char* filepath, int access) cfg->fd = fd; cfg->fd_access = access; } + cfg->is_open = 1; } if (cfg->io_pattern == IO_PATTERN_N1) { @@ -1514,6 +1536,7 @@ void test_fini(test_cfg* cfg) return; } + /* close the target file if it's still open */ test_close_file(cfg); if (cfg->use_unifyfs) { diff --git a/examples/src/write-transfer.c 
b/examples/src/write-transfer.c index 9e8122cc4..98361dfcc 100644 --- a/examples/src/write-transfer.c +++ b/examples/src/write-transfer.c @@ -429,6 +429,15 @@ int main(int argc, char* argv[]) global_read_bw); } + if (cfg->remove_target) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + // cleanup free(target_file); free(destination_file); diff --git a/examples/src/write.c b/examples/src/write.c index ca90e8e79..cceaba3bb 100644 --- a/examples/src/write.c +++ b/examples/src/write.c @@ -307,6 +307,15 @@ int main(int argc, char* argv[]) eff_write_bw); } + if (cfg->remove_target) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + // cleanup free(target_file); diff --git a/examples/src/writeread.c b/examples/src/writeread.c index be8e4efc3..15c57ea00 100644 --- a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -449,6 +449,15 @@ int main(int argc, char* argv[]) global_read_bw); } + if (cfg->remove_target) { + test_print_verbose_once(cfg, + "DEBUG: removing file %s", target_file); + rc = test_remove_file(cfg, target_file); + if (rc) { + test_print(cfg, "ERROR - test_remove_file(%s) failed", target_file); + } + } + // cleanup free(target_file); diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 0065fbd2b..412d1f046 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -343,6 +343,12 @@ static void register_client_server_rpcs(margo_instance_id mid) unifyfs_transfer_complete_in_t, unifyfs_transfer_complete_out_t, NULL); + + unifyfsd_rpc_context->rpcs.client_unlink_callback_id = + MARGO_REGISTER(mid, "unifyfs_unlink_callback_rpc", + unifyfs_unlink_callback_in_t, + unifyfs_unlink_callback_out_t, + NULL); } /* margo_server_rpc_init @@ -668,7 +674,8 @@ int invoke_client_heartbeat_rpc(int app_id, hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); /* call rpc function */ - LOGDBG("invoking the heartbeat rpc function in client"); + LOGDBG("invoking the heartbeat rpc function in client[%d:%d]", + app_id, client_id); double timeout_msec = 500; /* half a second */ hret = margo_forward_timed(handle, &in, timeout_msec); if (hret != HG_SUCCESS) { @@ -736,7 +743,8 @@ int invoke_client_mread_req_data_rpc(int app_id, hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); /* call rpc function */ - LOGDBG("invoking the mread req data rpc function in client"); + LOGDBG("invoking the mread[%d] req data (index=%d) rpc function in " + "client[%d:%d]", mread_id, read_index, app_id, client_id); hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_forward() failed"); @@ -794,8 +802,8 @@ int invoke_client_mread_req_complete_rpc(int app_id, hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); /* call rpc function */ - LOGDBG("invoking the mread[%d] complete rpc function in client", - mread_id); + LOGDBG("invoking the mread[%d] complete rpc function in client[%d:%d]", + mread_id, app_id, client_id); hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_forward() failed"); @@ -847,8 +855,8 @@ int invoke_client_transfer_complete_rpc(int app_id, hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); /* call rpc function */ - LOGDBG("invoking the 
transfer[%d] complete rpc function in client", - transfer_id); + LOGDBG("invoking the transfer[%d] complete rpc function in client[%d:%d]", + transfer_id, app_id, client_id); hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { LOGERR("margo_forward() failed"); @@ -874,3 +882,54 @@ int invoke_client_transfer_complete_rpc(int app_id, return ret; } + +/* invokes the client mread request completion rpc function */ +int invoke_client_unlink_callback_rpc(int app_id, + int client_id, + int gfid) +{ + hg_return_t hret; + + /* check that we have initialized margo */ + if (NULL == unifyfsd_rpc_context) { + return UNIFYFS_FAILURE; + } + + /* fill input struct */ + unifyfs_unlink_callback_in_t in; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; + in.gfid = (int32_t) gfid; + + /* get handle to rpc function */ + hg_id_t rpc_id = unifyfsd_rpc_context->rpcs.client_unlink_callback_id; + hg_handle_t handle = create_client_handle(rpc_id, app_id, client_id); + + /* call rpc function */ + LOGDBG("invoking the unlink (gfid=%d) callback rpc function in " + "client[%d:%d]", gfid, app_id, client_id); + hret = margo_forward(handle, &in); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward() failed"); + margo_destroy(handle); + return UNIFYFS_ERROR_MARGO; + } + + /* decode response */ + int ret; + unifyfs_unlink_callback_out_t out; + hret = margo_get_output(handle, &out); + if (hret == HG_SUCCESS) { + LOGDBG("Got response ret=%" PRIi32, out.ret); + ret = (int) out.ret; + margo_free_output(handle, &out); + } else { + LOGERR("margo_get_output() failed"); + ret = UNIFYFS_ERROR_MARGO; + } + + /* free resources */ + margo_destroy(handle); + + return ret; +} diff --git a/server/src/margo_server.h b/server/src/margo_server.h index ae9fc7112..11a577b9a 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -55,6 +55,7 @@ typedef struct ServerRpcIds { hg_id_t client_mread_data_id; hg_id_t client_mread_complete_id; hg_id_t client_transfer_complete_id; + hg_id_t client_unlink_callback_id; } server_rpcs_t; typedef struct ServerRpcContext { @@ -108,4 +109,9 @@ int invoke_client_transfer_complete_rpc(int app_id, int transfer_id, int error_code); +/* invokes the client unlink callback rpc function */ +int invoke_client_unlink_callback_rpc(int app_id, + int client_id, + int gfid); + #endif // MARGO_SERVER_H diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index e3abb081d..be1fa6214 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -186,9 +186,6 @@ unifyfs_rc disconnect_app_client(app_client* clnt); unifyfs_rc cleanup_app_client(app_config* app, app_client* clnt); - -/* arraylist to track failed clients */ -arraylist_t* failed_clients; // = NULL unifyfs_rc add_failed_client(int app_id, int client_id); diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index e45f1cd16..6bd31a4ac 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -20,6 +20,7 @@ #include "unifyfs_inode.h" #include "unifyfs_inode_tree.h" +#include "unifyfs_request_manager.h" struct unifyfs_inode_tree _global_inode_tree; struct unifyfs_inode_tree* global_inode_tree = &_global_inode_tree; @@ -114,6 +115,13 @@ int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr) return ret; } +static int int_cmp_fn(const void* a, const void* b) +{ + int ai = *(int*)a; + int bi = *(int*)b; + return ai - bi; +} + static int unifyfs_inode_destroy(struct unifyfs_inode* ino) { @@ -125,9 +133,19 @@ int unifyfs_inode_destroy(struct unifyfs_inode* 
ino) } if (NULL != ino->extents) { + + /* allocate an array to track local clients to which we should + * send an unlink callback */ + size_t n_clients = 0; + size_t max_clients = (size_t) ino->extents->count; + int* local_clients = calloc(max_clients, sizeof(int)); + int last_client; + int cb_app_id = -1; + /* iterate over extents and release local logio allocations */ unifyfs_inode_rdlock(ino); { + last_client = -1; struct extent_tree* tree = ino->extents; struct extent_tree_node* curr = NULL; while (NULL != (curr = extent_tree_iter(tree, curr))) { @@ -150,6 +168,19 @@ int unifyfs_inode_destroy(struct unifyfs_inode* ino) "client[%d:%d] log_offset=%zu nbytes=%zu", app_id, client_id, (size_t)log_off, nbytes); } + + if (NULL != local_clients) { + if (-1 == cb_app_id) { + cb_app_id = app_id; + } + /* add client id to local clients array */ + if (last_client != client_id) { + assert(n_clients < max_clients); + local_clients[n_clients] = client_id; + n_clients++; + } + last_client = client_id; + } } } } @@ -157,6 +188,35 @@ int unifyfs_inode_destroy(struct unifyfs_inode* ino) extent_tree_destroy(ino->extents); free(ino->extents); + + if (NULL != local_clients) { + qsort(local_clients, n_clients, sizeof(int), int_cmp_fn); + last_client = -1; + for (size_t i = 0; i < n_clients; i++) { + int cb_client_id = local_clients[i]; + if (cb_client_id == last_client) { + continue; + } + last_client = cb_client_id; + + /* submit a request to the client's reqmgr thread + * to cleanup client state */ + client_callback_req* cb = malloc(sizeof(*cb)); + if (NULL != cb) { + cb->req_type = UNIFYFS_CLIENT_CALLBACK_UNLINK; + cb->app_id = cb_app_id; + cb->client_id = cb_client_id; + cb->gfid = ino->gfid; + int rc = rm_submit_client_callback_request(cb); + if (UNIFYFS_SUCCESS != rc) { + LOGERR("failed to submit unlink callback " + "req to client[%d:%d]", + cb_app_id, cb_client_id); + } + } + } + free(local_clients); + } } pthread_rwlock_destroy(&(ino->rwlock)); diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 4fc161505..864364cae 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -139,6 +139,16 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) return NULL; } + /* allocate a list to track client rpc requests */ + thrd_ctrl->client_callbacks = + arraylist_create(UNIFYFS_CLIENT_MAX_FILES); + if (thrd_ctrl->client_callbacks == NULL) { + LOGERR("failed to allocate request manager client_callbacks!"); + pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); + free(thrd_ctrl); + return NULL; + } + /* record app and client id this thread will be serving */ thrd_ctrl->app_id = app_id; thrd_ctrl->client_id = client_id; @@ -802,6 +812,186 @@ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, return ret; } +/* submit a client callback request to the request manager thread */ +int rm_submit_client_callback_request(client_callback_req* req) +{ + assert(req != NULL); + + /* get application client */ + app_client* client = get_app_client(req->app_id, req->client_id); + if (NULL == client) { + LOGERR("app client [%d:%d] lookup failed", + req->app_id, req->client_id); + return EINVAL; + } + + /* LOGDBG("client callback: client=[%d:%d], type=%d, gfid=%d", + * req->app_id, req->client_id, req->req_type, req->gfid); */ + + /* get thread control structure */ + reqmgr_thrd_t* reqmgr = client->reqmgr; + assert(NULL != reqmgr); + RM_REQ_LOCK(reqmgr); + arraylist_add(reqmgr->client_callbacks, req); + RM_REQ_UNLOCK(reqmgr); + + 
signal_new_requests(reqmgr); + + return UNIFYFS_SUCCESS; +} + +/* this qsort() comparison function groups callbacks by type, then + * client + gfid. It also pushes any NULL elements to the end of the array */ +static int cb_arraylist_compare(const void* a, const void* b) +{ + const void* elema = *(const void**)a; + const void* elemb = *(const void**)b; + + /* first handle the NULL cases (use 'NULL > ptr' to push NULLs to end) */ + if (NULL == elema) { + if (NULL == elemb) { + return 0; + } else { + return 1; + } + } else if (NULL == elemb) { + return -1; + } + + /* now compare the callback requests */ + const client_callback_req* reqa = elema; + const client_callback_req* reqb = elemb; + + if (reqa->req_type < reqb->req_type) { + return -1; + } else if (reqa->req_type > reqb->req_type) { + return 1; + } else { // request types are equal + if (reqa->app_id < reqb->app_id) { + return -1; + } else if (reqa->app_id > reqb->app_id) { + return 1; + } else { // app_ids are equal + if (reqa->client_id < reqb->client_id) { + return -1; + } else if (reqa->client_id > reqb->client_id) { + return 1; + } else { // client_ids are equal + if (reqa->gfid < reqb->gfid) { + return -1; + } else if (reqa->gfid > reqb->gfid) { + return 1; + } else { // gfids are equal + return 0; + } + } + } + } +} + +/* iterate over list of callbacks and invoke rpcs */ +static int rm_process_client_callbacks(reqmgr_thrd_t* reqmgr) +{ + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + + /* this will hold a list of client requests if we find any */ + arraylist_t* client_cbs = NULL; + + /* lock to access requests */ + RM_REQ_LOCK(reqmgr); + + /* if we have any requests, take pointer to the list + * of requests and replace it with a newly allocated + * list on the request manager structure */ + int num_client_cbs = arraylist_size(reqmgr->client_callbacks); + if (num_client_cbs) { + /* got some client requets, take the list and replace + * it with an empty list */ + LOGDBG("processing %d client callback requests", num_client_cbs); + client_cbs = reqmgr->client_callbacks; + reqmgr->client_callbacks = + arraylist_create(UNIFYFS_CLIENT_MAX_FILES); + } + + /* release lock on reqmgr requests */ + RM_REQ_UNLOCK(reqmgr); + + if (0 == num_client_cbs) { + return UNIFYFS_SUCCESS; + } + + /* sort the requests to make it easy to find duplicates */ + int rc = arraylist_sort(client_cbs, cb_arraylist_compare); + if (rc) { + LOGERR("failed to sort client callback arraylist"); + } + + /* iterate over each client request */ + int last_req_type = -1; + int last_req_app = -1; + int last_req_cli = -1; + int last_req_gfid = -1; + client_callback_req* req; + for (int i = 0; i < num_client_cbs; i++) { + /* process next request */ + int rret; + req = (client_callback_req*) arraylist_get(client_cbs, i); + if (NULL == req) { + continue; + } + + /* ignore duplicate callback reqs */ + if ((last_req_type == req->req_type) && + (last_req_app == req->app_id) && + (last_req_cli == req->client_id) && + (last_req_gfid == req->gfid)) { + continue; + } + last_req_type = req->req_type; + last_req_app = req->app_id; + last_req_cli = req->client_id; + last_req_gfid = req->gfid; + + switch (req->req_type) { + case UNIFYFS_CLIENT_CALLBACK_LAMINATE: + LOGERR("laminate callback not yet implemented"); + rret = UNIFYFS_ERROR_NYI; + break; + case UNIFYFS_CLIENT_CALLBACK_TRUNCATE: + LOGERR("truncate callback not yet implemented"); + rret = UNIFYFS_ERROR_NYI; + break; + case UNIFYFS_CLIENT_CALLBACK_UNLINK: + LOGDBG("unlink callback - client[%d:%d] gfid=%d", + 
req->app_id, req->client_id, req->gfid); + rret = invoke_client_unlink_callback_rpc(req->app_id, + req->client_id, + req->gfid); + break; + default: + LOGERR("unsupported client rpc request type %d", req->req_type); + rret = UNIFYFS_ERROR_NYI; + break; + } + if (rret != UNIFYFS_SUCCESS) { + if ((rret != ENOENT) && (rret != EEXIST)) { + LOGERR("client rpc request %d failed (%s)", + i, unifyfs_rc_enum_description(rret)); + } + ret = rret; + } + } + + /* free the list if we have one */ + if (NULL != client_cbs) { + /* NOTE: this will call free() on each req in the arraylist */ + arraylist_free(client_cbs); + } + + return ret; +} + /* submit a client rpc request to the request manager thread */ int rm_submit_client_rpc_request(unifyfs_fops_ctx_t* ctx, client_rpc_req_t* req) @@ -1223,7 +1413,6 @@ static int process_unlink_rpc(reqmgr_thrd_t* reqmgr, return ret; } - /* iterate over list of chunk reads and send responses */ static int rm_process_client_requests(reqmgr_thrd_t* reqmgr) { @@ -1365,6 +1554,12 @@ void* request_manager_thread(void* arg) * with main thread, new items inserted by the rpc handler */ int rc; while (1) { + /* process any client callback requests */ + rc = rm_process_client_callbacks(thrd_ctrl); + if (rc != UNIFYFS_SUCCESS) { + LOGWARN("failed to process client rpc requests"); + } + /* process any client requests */ rc = rm_process_client_requests(thrd_ctrl); if (rc != UNIFYFS_SUCCESS) { @@ -1386,8 +1581,7 @@ void* request_manager_thread(void* arg) /* grab lock */ RM_LOCK(thrd_ctrl); - /* inform dispatcher that we're waiting for work - * inside the critical section */ + /* set flag to indicate that we're waiting for work */ thrd_ctrl->waiting_for_work = 1; /* release lock and wait to be signaled by dispatcher */ diff --git a/server/src/unifyfs_request_manager.h b/server/src/unifyfs_request_manager.h index cc82caa6d..0bfba4079 100644 --- a/server/src/unifyfs_request_manager.h +++ b/server/src/unifyfs_request_manager.h @@ -35,6 +35,13 @@ #include "unifyfs_fops.h" #include "unifyfs_metadata_mdhim.h" +typedef struct { + client_callback_e req_type; + int app_id; + int client_id; + int gfid; +} client_callback_req; + typedef struct { client_rpc_e req_type; hg_handle_t handle; @@ -87,6 +94,9 @@ typedef struct reqmgr_thrd { /* list of client rpc requests */ arraylist_t* client_reqs; + /* list of client callback requests */ + arraylist_t* client_callbacks; + /* flag set to indicate request manager thread should exit */ int exit_flag; @@ -149,6 +159,15 @@ int rm_handle_chunk_read_responses(reqmgr_thrd_t* thrd_ctrl, */ int rm_submit_read_request(server_read_req_t* req); +/** + * @brief submit a client callback request to the request manager thread. + * + * @param req pointer to client callback request struct + * + * @return UNIFYFS_SUCCESS, or error code + */ +int rm_submit_client_callback_request(client_callback_req* req); + /** * @brief submit a client rpc request to the request manager thread. 
* diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index edbc63380..1f0628a60 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -53,6 +53,9 @@ size_t glb_num_servers; // size of glb_servers array unifyfs_cfg_t server_cfg; +/* arraylist to track failed clients */ +arraylist_t* failed_clients; // = NULL + static ABT_mutex app_configs_abt_sync; static app_config* app_configs[UNIFYFS_SERVER_MAX_NUM_APPS]; /* list of apps */ static size_t clients_per_app = UNIFYFS_SERVER_MAX_APP_CLIENTS; From dde11f7fb674a20b9629ecf7cf2ffc94978fedac Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Wed, 11 Aug 2021 09:19:22 -0400 Subject: [PATCH 38/81] fix undefined refs for inline logio functions --- common/src/unifyfs_logio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/unifyfs_logio.c b/common/src/unifyfs_logio.c index ed752adbd..945a8aa10 100644 --- a/common/src/unifyfs_logio.c +++ b/common/src/unifyfs_logio.c @@ -46,7 +46,7 @@ typedef struct log_header { /* chunk slot_map immediately follows header and occupies rest of the page */ // slot_map chunk_map; /* chunk slot_map that tracks reservations */ -inline void LOCK_LOG_HEADER(log_header* hdr) +static inline void LOCK_LOG_HEADER(log_header* hdr) { assert(NULL != hdr); while (hdr->updating) { @@ -55,7 +55,7 @@ inline void LOCK_LOG_HEADER(log_header* hdr) hdr->updating = 1; } -inline void UNLOCK_LOG_HEADER(log_header* hdr) +static inline void UNLOCK_LOG_HEADER(log_header* hdr) { assert(NULL != hdr); assert(hdr->updating); From 79c03556b88999ca2e36c5324a94e778a5b737bb Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Tue, 10 Aug 2021 16:34:41 -0400 Subject: [PATCH 39/81] fix undefined symbols in libunifyfs_api --- client/src/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/src/Makefile.am b/client/src/Makefile.am index fa46fdf7f..ac4573ee3 100644 --- a/client/src/Makefile.am +++ b/client/src/Makefile.am @@ -66,7 +66,6 @@ LIBRARY_API_SRC_FILES = \ CLIENT_CORE_SRC_FILES = \ $(UNIFYFS_COMMON_SRCS) \ - client_api.c \ client_read.c \ client_read.h \ client_transfer.c \ @@ -82,6 +81,7 @@ CLIENT_CORE_SRC_FILES = \ utlist.h POSIX_CLIENT_SRC_FILES = \ + client_api.c \ posix_client.c \ posix_client.h \ unifyfs-dirops.c \ From cdc22146f0e8ba5b0320ffe4f1c2b3fbdcfd7a63 Mon Sep 17 00:00:00 2001 From: CamStan Date: Thu, 12 Aug 2021 16:38:46 -0700 Subject: [PATCH 40/81] Testing: print failed tests at end of run Tweak sharness.sh to print a list of any tests that failed at the end of running to prevent needing to scroll through the tests to find failures. Move JOB_LAUNCH_COMMAND for running unit tests to ascent.yml in order to prevent hang on LC systems and since Ascent is the only system that needs to launch the unit tests this way. 
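For illustration, the sharness.sh change boils down to the following pattern (a minimal sketch, not the actual sharness.sh code; record_failure and print_summary are hypothetical helper names used only for this example):

    # collect the label of each failing test in a bash array
    declare -a failed_tests_list=()

    record_failure() {
        # $1 = test number, $2 = test description
        failed_tests_list+=("$1 - $2")
    }

    # at the end of the run, print all collected failures in one place
    print_summary() {
        if [ ${#failed_tests_list[@]} -gt 0 ]; then
            echo "Failed tests list:"
            printf '%s\n' "${failed_tests_list[@]}"
        fi
    }

The array append and the final printf mirror what the patch adds to test_failure_() and test_done(), so a failed run ends with a compact list instead of requiring a scroll back through the full TAP output.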
--- .gitlab-ci.yml | 2 -- .gitlab/ascent.yml | 2 ++ t/sharness.sh | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a5e520612..0d864977d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,6 @@ stages: .slurm-single-node-template: variables: - JOB_LAUNCH_COMMAND: "srun -N1 -n1" LLNL_SLURM_SCHEDULER_PARAMETERS: "-N 1 -p $QUEUE -t $UNIT_WALL_TIME -J unifyfs-unit-tests" .slurm-multi-node-template: @@ -35,7 +34,6 @@ stages: .lsf-single-node-template: variables: - JOB_LAUNCH_COMMAND: "jsrun -r1 -n1" LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes 1 -q $QUEUE -W $UNIT_WALL_TIME -J unifyfs-unit-tests" SCHEDULER_PARAMETERS: "-nnodes 1 -P $PROJECT_ID -W $UNIT_WALL_TIME -J unifyfs-unit-tests" diff --git a/.gitlab/ascent.yml b/.gitlab/ascent.yml index b961cc741..8fee5129f 100644 --- a/.gitlab/ascent.yml +++ b/.gitlab/ascent.yml @@ -21,6 +21,8 @@ tags: [nobatch] .ascent-batch-template: + variables: + JOB_LAUNCH_COMMAND: "jsrun -r1 -n1" extends: .ascent-template tags: [batch] diff --git a/t/sharness.sh b/t/sharness.sh index a23888982..89318e199 100644 --- a/t/sharness.sh +++ b/t/sharness.sh @@ -158,6 +158,8 @@ test_fixed=0 test_broken=0 test_success=0 +declare -a failed_tests_list=() + die() { code=$? if test -n "$EXIT_OK"; then @@ -270,6 +272,7 @@ test_ok_() { test_failure_() { test_failure=$(($test_failure + 1)) say_color error "not ok $test_count - $1" + failed_tests_list+=("$test_count - $1") shift echo "$@" | sed -e 's/^/# /' test "$immediate" = "" || { EXIT_OK=t; exit 1; } @@ -784,6 +787,8 @@ test_done() { *) say_color error "# failed $test_failure among $msg" say "1..$test_count" + say "Failed tests list:" + printf '%s\n' "${failed_tests_list[@]}" exit 1 ;; From 1182c15a6c705d8706fd9f309299a435c267029d Mon Sep 17 00:00:00 2001 From: Rob Latham Date: Tue, 10 Aug 2021 08:30:18 -0500 Subject: [PATCH 41/81] pkg-config file for API option --- Makefile.am | 3 ++- client/unifyfs-api.pc.in | 12 ++++++++++++ configure.ac | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 client/unifyfs-api.pc.in diff --git a/Makefile.am b/Makefile.am index f8cf18283..d97516bf3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,7 +5,8 @@ CONFIG = ordered ACLOCAL_AMFLAGS = -I m4 pkgconfigdir = @pkgconfigdir@ -pkgconfig_DATA = client/unifyfs.pc +pkgconfig_DATA = client/unifyfs.pc \ + client/unifyfs-api.pc CLEANFILES = diff --git a/client/unifyfs-api.pc.in b/client/unifyfs-api.pc.in new file mode 100644 index 000000000..38f01314f --- /dev/null +++ b/client/unifyfs-api.pc.in @@ -0,0 +1,12 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: UnifyFS +Description: client library for UnifyFS unified burst buffer file system +Version: @LIBUNIFYFS_API_VERSION@ +Requires: +Libs: -L${libdir} -lunifyfs_api +Cflags: -I${includedir} + diff --git a/configure.ac b/configure.ac index 54b3daca4..3a70a0446 100755 --- a/configure.ac +++ b/configure.ac @@ -390,6 +390,7 @@ AC_CONFIG_FILES([Makefile client/Makefile client/src/Makefile client/unifyfs.pc + client/unifyfs-api.pc examples/Makefile examples/src/Makefile extras/Makefile From d31207032b32b18857ba7bce8faa9cb3da76ddef Mon Sep 17 00:00:00 2001 From: Rob Latham Date: Wed, 18 Aug 2021 11:40:40 -0500 Subject: [PATCH 42/81] Release allocated objects Valgrind flags these allocations as ==1655690== 10,486,400 (640 direct, 10,485,760 indirect) bytes in 1 blocks are definitely lost in loss record 35 of 35 ==1655690== at 0x483DD99: calloc (in 
/usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so) ==1655690== by 0x4879CBC: unifyfs_dispatch_io (unifyfs_api_io.c:200) ==1655690== by 0x109662: main (unify-example.c:46) --- client/src/unifyfs_api_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/client/src/unifyfs_api_io.c b/client/src/unifyfs_api_io.c index 45ba123f7..17602a156 100644 --- a/client/src/unifyfs_api_io.c +++ b/client/src/unifyfs_api_io.c @@ -342,6 +342,10 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, break; } } + if (rd_reqs) free(rd_reqs); + if (wr_reqs) free (wr_reqs); + if (tr_reqs) free (tr_reqs); + if (s_reqs) free(s_reqs); return UNIFYFS_SUCCESS; } From 396e3c134e22b816322b9ec5d31998e81bacf3fc Mon Sep 17 00:00:00 2001 From: CamStan Date: Thu, 19 Aug 2021 11:24:28 -0700 Subject: [PATCH 43/81] Update project contact email Update the contact email for the project in docs and in the bug-report argument of the AC_INIT macro. --- configure.ac | 2 +- docs/contribute-ways.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 3a70a0446..84db84643 100755 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_LANG([C]) AC_INIT([unifyfs], m4_esyscmd([git describe --always | awk '/.*/{sub(/^v/,""); printf "%s",$1; exit}']), - [unifycr@llnl.gov]) + [ecp-unifyfs@exascaleproject.org]) AC_PREREQ(2.60) AC_CONFIG_SRCDIR([configure.ac]) AC_CONFIG_HEADERS([config.h]) diff --git a/docs/contribute-ways.rst b/docs/contribute-ways.rst index e44ccc13d..d0a22324f 100644 --- a/docs/contribute-ways.rst +++ b/docs/contribute-ways.rst @@ -125,5 +125,5 @@ anything you notice that needs to be changed. .. explicit external hyperlink targets -.. _mailing list: ecp-unifycr@exascaleproject.org +.. _mailing list: ecp-unifyfs@exascaleproject.org .. _issue tracker: https://github.com/LLNL/UnifyFS/issues From e7e0a32683390e9418ffda783741072336285675 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Mon, 30 Aug 2021 10:20:55 -0400 Subject: [PATCH 44/81] fix issue 675 --- server/src/unifyfs_p2p_rpc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index 587798f83..94f0a2aa5 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -567,7 +567,8 @@ int unifyfs_invoke_find_extents_rpc(int gfid, unifyfs_file_attr_t attrs; int ret = sm_get_fileattr(gfid, &attrs); if (ret == UNIFYFS_SUCCESS) { - if (attrs.is_laminated || (owner_rank == glb_pmi_rank)) { + if ((owner_rank == glb_pmi_rank) || + (attrs.is_laminated && attrs.is_shared)) { /* do local lookup */ ret = sm_find_extents(gfid, (size_t)num_extents, extents, num_chunks, chunks); From 5725ede128890eb52b07853060e2b92bf79d2a89 Mon Sep 17 00:00:00 2001 From: CamStan Date: Mon, 30 Aug 2021 11:54:15 -0700 Subject: [PATCH 45/81] Integration tests: use unifyfs module if available Adjust UnifyFS integration tests to prefer the unifyfs module if it is a recognized command-line utility. Otherwise default to user provided UNIFYFS_INSTALL envar or a simple search if not provided. 
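The resulting lookup order can be summarized by this minimal sketch (illustrative only; the real logic in 001-setup.sh additionally falls back to searching $BASE_SEARCH_DIR for unifyfsd before giving up):

    # prefer a loaded unifyfs module, then an explicit install prefix
    if [[ -n $(which unifyfs 2>/dev/null) ]]; then
        UNIFYFS_CLU="unifyfs"
    elif [[ -n $UNIFYFS_INSTALL ]]; then
        UNIFYFS_CLU="$UNIFYFS_INSTALL/bin/unifyfs"
    else
        echo "UnifyFS install directory not found" >&2
        exit 1
    fi

Later test scripts then invoke $UNIFYFS_CLU directly, which is why 002-start-server.sh only passes an explicit -e $UNIFYFS_BIN/unifyfsd when a non-module install was detected.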
--- t/ci/001-setup.sh | 67 +++++++++++++++++----------------------- t/ci/002-start-server.sh | 10 ++++-- t/ci/990-stop-server.sh | 2 +- 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/t/ci/001-setup.sh b/t/ci/001-setup.sh index 59fd6aff0..921dd5468 100755 --- a/t/ci/001-setup.sh +++ b/t/ci/001-setup.sh @@ -97,27 +97,32 @@ source $UNIFYFS_CI_DIR/ci-functions.sh ########## Locate UnifyFS install and examples ########## -# Check if we have Spack and if UnifyFS is installed. -# If don't have both, fall back to checking for non-spack install. -# If neither, fail out. -# Set UNIFYFS_INSTALL to skip searching. -echo "$infomsg Looking for UnifyFS install directory..." - -# Look for UnifyFS install directory if the user didn't already set -# $UNIFYFS_INSTALL to the directory containing bin/ and libexec/ -if [[ -z $UNIFYFS_INSTALL ]]; then +# Check if unifyfs is loaded and recognized command-line utility; if so, use it +# If not, check if UNIFYFS_INSTALL was set to the install directory and use it +# If neither, do simple auto-search in $BASE_SEARCH_DIR to check for unifyfsd +# If none of the above, fail out. +if [[ -n $(which unifyfs 2>/dev/null) ]]; then + # If unifyfs is a loaded module, use it and set UNIFYFS_INSTALL + echo "$infomsg Using unifyfs module" + unifyfs_bin_dir=$(dirname "$(readlink -fm "$(which unifyfs)")") + UNIFYFS_INSTALL=$(dirname "$(readlink -fm $unifyfs_bin_dir)") + UNIFYFS_CLU="unifyfs" +elif [[ -n $UNIFYFS_INSTALL ]]; then + # $UNIFYFS_INSTALL directory was provided + UNIFYFS_BIN="$UNIFYFS_INSTALL/bin" + UNIFYFS_CLU="${UNIFYFS_BIN}/unifyfs" +elif [[ -z $UNIFYFS_INSTALL ]]; then + # Search $BASE_SEARCH_DIR for UnifyFS install directory if envar wasn't set + echo "$infomsg Searching for UnifyFS install directory..." # Search for unifyfsd starting in $BASE_SEARCH_DIR and omitting SPACK_ROOT unifyfsd_exe="$(find_executable $BASE_SEARCH_DIR "*/bin/unifyfsd"\ $SPACK_ROOT)" if [[ -x $unifyfsd_exe ]]; then # Set UNIFYFS_INSTALL to the dir containing bin/ and libexec/ UNIFYFS_INSTALL="$(dirname "$(dirname "$unifyfsd_exe")")" - # Else check for $SPACK_ROOT and if unifyfs is installed - elif [[ -n $SPACK_ROOT && -d $(spack location -i unifyfs 2>/dev/null) ]]; - then - # Might have a problem with variants and arch - UNIFYFS_INSTALL="$(spack location -i unifyfs)" - else + UNIFYFS_BIN="$UNIFYFS_INSTALL/bin" + UNIFYFS_CLU="${UNIFYFS_BIN}/unifyfs" + else # unifyfsd executable not found echo >&2 "$errmsg Unable to find UnifyFS install directory" echo >&2 "$errmsg Set \$UNIFYFS_INSTALL to the directory containing" \ "bin/ and libexec/ or \`spack install unifyfs\`" @@ -125,35 +130,19 @@ if [[ -z $UNIFYFS_INSTALL ]]; then fi fi -# Make sure UNIFYFS_INSTALL, bin/, and libexec/ exist -if [[ -d $UNIFYFS_INSTALL && -d ${UNIFYFS_INSTALL}/bin && - -d ${UNIFYFS_INSTALL}/libexec ]]; then - echo "$infomsg Found UnifyFS install directory: $UNIFYFS_INSTALL" - - UNIFYFS_BIN="$UNIFYFS_INSTALL/bin" +# Make sure UNIFYFS_INSTALL and libexec/ exist +if [[ -d $UNIFYFS_INSTALL && -d ${UNIFYFS_INSTALL}/libexec ]]; then UNIFYFS_EXAMPLES="$UNIFYFS_INSTALL/libexec" - echo "$infomsg Found UnifyFS bin directory: $UNIFYFS_BIN" - echo "$infomsg Found UnifyFS examples directory: $UNIFYFS_EXAMPLES" + echo "$infomsg Using UnifyFS install directory: $UNIFYFS_INSTALL" + echo "$infomsg Using UnifyFS command-line utilty: $UNIFYFS_CLU" + echo "$infomsg Using UnifyFS examples directory: $UNIFYFS_EXAMPLES" else - echo >&2 "$errmsg Ensure \$UNIFYFS_INSTALL exists and is the directory" \ + echo >&2 "$errmsg Load the 
unifyfs module or" \ + echo >&2 "$errmsg Ensure \$UNIFYFS_INSTALL is set and is the directory" \ "containing bin/ and libexec/" exit 1 fi -# Check for necessary Spack modules if Spack is detected -# Since GitLab Runners don't like this, just warn users running this by hand but -# don't fail out -if [[ -n $(which spack 2>/dev/null) ]]; then - loaded_modules=$(module list 2>&1) - modules="gotcha argobots mercury margo spath" - for mod in $modules; do - if ! [[ $(echo "$loaded_modules" | fgrep "$mod") ]]; then - echo "$errmsg $mod not detected. Please 'spack load $mod'" - fi - done -fi - - ########## Determine job launcher and source associated setup ########## # Source envar, functions, and set up JOB_RUN_COMMAND if lsf, slurm, or mpirun @@ -214,7 +203,7 @@ echo "$infomsg Set UNIFYFS_CI_TEMP_DIR to change both of these to same path" # storage nls=$nlt -export UNIFYFS_LOGIO_SPILL_SIZE=${UNIFYFS_LOGIO_SPILL_SIZE:-$((5 * GB))} +export UNIFYFS_LOGIO_SPILL_SIZE=${UNIFYFS_LOGIO_SPILL_SIZE:-$((16 * GB))} export UNIFYFS_LOGIO_SPILL_DIR=${UNIFYFS_LOGIO_SPILL_DIR:-$nls} echo "$infomsg UNIFYFS_LOGIO_SPILL_SIZE set as $UNIFYFS_LOGIO_SPILL_SIZE" echo "$infomsg UNIFYFS_LOGIO_SPILL_DIR set as $UNIFYFS_LOGIO_SPILL_DIR" diff --git a/t/ci/002-start-server.sh b/t/ci/002-start-server.sh index 88e834657..556d565eb 100755 --- a/t/ci/002-start-server.sh +++ b/t/ci/002-start-server.sh @@ -54,8 +54,14 @@ test_expect_success "unifyfsd hasn't started yet" ' process_is_not_running unifyfsd 10 ' -$UNIFYFS_BIN/unifyfs start -c -d -S $UNIFYFS_SHAREDFS_DIR \ - -e $UNIFYFS_BIN/unifyfsd &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out +# UNIFYFS_BIN envar is set if not using unifyfs module +if [[ -n $UNIFYFS_BIN ]]; then + $UNIFYFS_CLU start -c -d -S $UNIFYFS_SHAREDFS_DIR \ + -e $UNIFYFS_BIN/unifyfsd &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out +else + $UNIFYFS_CLU start -c -d -S $UNIFYFS_SHAREDFS_DIR \ + &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out +fi test_expect_success "unifyfsd started" ' process_is_running unifyfsd 10 || diff --git a/t/ci/990-stop-server.sh b/t/ci/990-stop-server.sh index a27dd5672..79b7ed84f 100755 --- a/t/ci/990-stop-server.sh +++ b/t/ci/990-stop-server.sh @@ -24,7 +24,7 @@ test_expect_success "unifyfsd is still running" ' process_is_running unifyfsd 10 ' -$UNIFYFS_BIN/unifyfs terminate -d &> ${UNIFYFS_LOG_DIR}/unifyfs.terminate.out +$UNIFYFS_CLU terminate -d &> ${UNIFYFS_LOG_DIR}/unifyfs.terminate.out test_expect_success "unifyfsd has stopped" ' process_is_not_running unifyfsd 10 From 896f182fca996e08cb00eba05180772b09ba7b3c Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Tue, 31 Aug 2021 07:42:00 -0400 Subject: [PATCH 46/81] fix attempt to sync unlinked file --- client/src/client_read.c | 46 ++++++++++++------ client/src/client_transfer.c | 25 ++++++++-- client/src/unifyfs-sysio.c | 2 + client/src/unifyfs.c | 13 ++++++ client/src/unifyfs_api_internal.h | 5 +- client/src/unifyfs_fid.c | 77 ++++++++++++++++++++----------- 6 files changed, 124 insertions(+), 44 deletions(-) diff --git a/client/src/client_read.c b/client/src/client_read.c index 7824b2d55..d64373f2c 100644 --- a/client/src/client_read.c +++ b/client/src/client_read.c @@ -50,10 +50,13 @@ client_mread_status* client_create_mread_request(unifyfs_client* client, return NULL; } + pthread_mutex_lock(&(client->sync)); + int active_count = arraylist_size(client->active_mreads); if (active_count == arraylist_capacity(client->active_mreads)) { /* already at full capacity for outstanding reads */ LOGWARN("too many outstanding client reads"); + pthread_mutex_unlock(&(client->sync)); return NULL; } @@ -69,20 +72,23 @@ client_mread_status* client_create_mread_request(unifyfs_client* client, client_mread_status* mread = calloc(1, sizeof(client_mread_status)); if (NULL == mread) { LOGERR("failed to allocate client mread status"); - return NULL; + } else { + mread->client = client; + mread->id = mread_id; + mread->reqs = read_reqs; + mread->n_reads = (unsigned int) n_reads; + ABT_mutex_create(&(mread->sync)); + + int rc = arraylist_insert(client->active_mreads, + (int)req_ndx, (void*)mread); + if (rc != 0) { + ABT_mutex_free(&(mread->sync)); + free(mread); + mread = NULL; + } } - mread->client = client; - mread->id = mread_id; - mread->reqs = read_reqs; - mread->n_reads = (unsigned int) n_reads; - ABT_mutex_create(&(mread->sync)); - int rc = arraylist_insert(client->active_mreads, - (int)req_ndx, (void*)mread); - if (rc != 0) { - free(mread); - return NULL; - } + pthread_mutex_unlock(&(client->sync)); return mread; } @@ -101,16 +107,23 @@ int client_remove_mread_request(client_mread_status* mread) return EINVAL; } + int ret = UNIFYFS_SUCCESS; + + pthread_mutex_lock(&(client->sync)); + int list_index = (int) id_to_list_index(client, mread->id); void* list_item = arraylist_remove(client->active_mreads, list_index); if (list_item == (void*)mread) { ABT_mutex_free(&(mread->sync)); free(mread); - return UNIFYFS_SUCCESS; } else { LOGERR("mismatch on client->active_mreads index=%d", list_index); - return UNIFYFS_FAILURE; + ret = UNIFYFS_FAILURE; } + + pthread_mutex_unlock(&(client->sync)); + + return ret; } /* Retrieve the mread request corresponding to the given mread_id. 
*/ @@ -122,6 +135,8 @@ client_mread_status* client_get_mread_status(unifyfs_client* client, return NULL; } + pthread_mutex_lock(&(client->sync)); + int list_index = (int) id_to_list_index(client, mread_id); void* list_item = arraylist_get(client->active_mreads, list_index); client_mread_status* status = (client_mread_status*)list_item; @@ -134,6 +149,9 @@ client_mread_status* client_get_mread_status(unifyfs_client* client, } else { LOGERR("lookup of mread status for id=%u failed", mread_id); } + + pthread_mutex_unlock(&(client->sync)); + return status; } diff --git a/client/src/client_transfer.c b/client/src/client_transfer.c index db5c1cfb4..99c530f87 100644 --- a/client/src/client_transfer.c +++ b/client/src/client_transfer.c @@ -89,10 +89,13 @@ int client_create_transfer(unifyfs_client* client, return EINVAL; } + pthread_mutex_lock(&(client->sync)); + int active_count = arraylist_size(client->active_transfers); if (active_count == arraylist_capacity(client->active_transfers)) { /* already at full capacity for outstanding reads */ LOGWARN("too many outstanding client transfers"); + pthread_mutex_unlock(&(client->sync)); return UNIFYFS_FAILURE; } @@ -108,6 +111,7 @@ int client_create_transfer(unifyfs_client* client, client_transfer_status* transfer = calloc(1, sizeof(*transfer)); if (NULL == transfer) { LOGERR("failed to allocate transfer status struct"); + pthread_mutex_unlock(&(client->sync)); return ENOMEM; } transfer->client = client; @@ -119,9 +123,14 @@ int client_create_transfer(unifyfs_client* client, int rc = arraylist_insert(client->active_transfers, (int)req_ndx, (void*)transfer); if (rc != 0) { + ABT_mutex_free(&(transfer->sync)); free(transfer); + pthread_mutex_unlock(&(client->sync)); return rc; } + + pthread_mutex_unlock(&(client->sync)); + req->_reqid = transfer_id; debug_print_transfer_req(req); @@ -137,13 +146,16 @@ client_transfer_status* client_get_transfer(unifyfs_client* client, return NULL; } + pthread_mutex_lock(&(client->sync)); + int list_index = (int) id_to_list_index(client, transfer_id); void* list_item = arraylist_get(client->active_transfers, list_index); if (list_item == NULL) { LOGERR("client->active_transfers index=%d is NULL", list_index); - return NULL; } + pthread_mutex_unlock(&(client->sync)); + client_transfer_status* transfer = list_item; return transfer; } @@ -206,16 +218,23 @@ int client_cleanup_transfer(unifyfs_client* client, } } + int ret = UNIFYFS_SUCCESS; + + pthread_mutex_lock(&(client->sync)); + int list_index = (int) id_to_list_index(client, req->_reqid); void* list_item = arraylist_remove(client->active_transfers, list_index); if (list_item == (void*)transfer) { ABT_mutex_free(&(transfer->sync)); free(transfer); - return UNIFYFS_SUCCESS; } else { LOGERR("mismatch on client->active_transfers index=%d", list_index); - return UNIFYFS_FAILURE; + ret = UNIFYFS_FAILURE; } + + pthread_mutex_unlock(&(client->sync)); + + return ret; } /* Update the transfer status for the client (app_id + client_id) diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 99a69159b..bd28177cd 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -594,9 +594,11 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) } /* finally overwrite the old name with the new name */ + pthread_mutex_lock(&(posix_client->sync)); LOGDBG("Changing %s to %s", (char*)posix_client->unifyfs_filelist[fid].filename, new_upath); strcpy((void*)posix_client->unifyfs_filelist[fid].filename, new_upath); + 
pthread_mutex_unlock(&(posix_client->sync)); /* success */ return 0; diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index 070d85650..f4909c60d 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -266,6 +266,7 @@ static int init_superblock_structures(unifyfs_client* client) for (int i = 0; i < client->max_files; i++) { /* indicate that file id is not in use by setting flag to 0 */ client->unifyfs_filelist[i].in_use = 0; + client->unifyfs_filelist[i].filename[0] = 0; } /* initialize stack of free file ids */ @@ -439,6 +440,11 @@ int unifyfs_client_init(unifyfs_client* client) return UNIFYFS_FAILURE; } + pthread_mutexattr_t mux_recursive; + pthread_mutexattr_init(&mux_recursive); + pthread_mutexattr_settype(&mux_recursive, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&(client->sync), &mux_recursive); + /* remember that we've now initialized the library */ client->state.initialized = 1; } @@ -473,6 +479,8 @@ int unifyfs_client_fini(unifyfs_client* client) return UNIFYFS_FAILURE; } + pthread_mutex_lock(&(client->sync)); + if (NULL != client->active_mreads) { arraylist_free(client->active_mreads); } @@ -481,6 +489,9 @@ int unifyfs_client_fini(unifyfs_client* client) arraylist_free(client->active_transfers); } + pthread_mutex_unlock(&(client->sync)); + pthread_mutex_destroy(&(client->sync)); + /* close spillover files */ if (NULL != client->state.logio_ctx) { unifyfs_logio_close(client->state.logio_ctx, 0); @@ -530,6 +541,7 @@ int unifyfs_sync_files(unifyfs_client* client) int ret = UNIFYFS_SUCCESS; /* sync every active file */ + pthread_mutex_lock(&(client->sync)); for (int i = 0; i < client->max_files; i++) { if (client->unifyfs_filelist[i].in_use) { /* got an active file, so sync this file id */ @@ -539,6 +551,7 @@ int unifyfs_sync_files(unifyfs_client* client) } } } + pthread_mutex_unlock(&(client->sync)); return ret; } diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h index 934531f9d..3431be430 100644 --- a/client/src/unifyfs_api_internal.h +++ b/client/src/unifyfs_api_internal.h @@ -53,7 +53,7 @@ typedef struct { int in_use; /* full path and name of file */ - const char filename[UNIFYFS_MAX_FILENAME]; + char filename[UNIFYFS_MAX_FILENAME]; } unifyfs_filename_t; /* UnifyFS file system client structure */ @@ -77,6 +77,9 @@ typedef struct unifyfs_client { /* tracks current working directory within namespace */ char* cwd; + /* mutex for synchronizing updates to below state */ + pthread_mutex_t sync; + /* an arraylist to maintain the active mread requests for the client */ arraylist_t* active_mreads; unsigned int mread_id_generator; /* to generate unique mread ids */ diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index fbbd98a14..b4dc7bf2f 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -50,8 +50,8 @@ int unifyfs_fid_free(unifyfs_client* client, * --------------------------------------- */ /* allocate and initialize data management resource for file */ -static int fid_store_alloc(unifyfs_client* client, - int fid) +static int fid_storage_alloc(unifyfs_client* client, + int fid) { /* get meta data for this file */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); @@ -202,9 +202,12 @@ int unifyfs_fid_create_file(unifyfs_client* client, return -ENAMETOOLONG; } + pthread_mutex_lock(&(client->sync)); + /* allocate an id for this file */ int fid = unifyfs_fid_alloc(client); if (fid < 0) { + pthread_mutex_unlock(&(client->sync)); return fid; } @@ -217,6 +220,8 @@ int 
unifyfs_fid_create_file(unifyfs_client* client, LOGDBG("Filename %s got unifyfs fid %d", client->unifyfs_filelist[fid].filename, fid); + pthread_mutex_unlock(&(client->sync)); + /* get metadata for this file id */ unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); assert(meta != NULL); @@ -332,25 +337,32 @@ int unifyfs_fid_create_directory(unifyfs_client* client, int unifyfs_fid_delete(unifyfs_client* client, int fid) { + pthread_mutex_lock(&(client->sync)); + + /* set this file id as not in use */ + client->unifyfs_filelist[fid].in_use = 0; + client->unifyfs_filelist[fid].filename[0] = 0; + /* finalize the storage we're using for this file */ int rc = fid_storage_free(client, fid); if (rc != UNIFYFS_SUCCESS) { /* failed to release structures tracking storage, * bail out to keep its file id active */ + pthread_mutex_unlock(&(client->sync)); return rc; } - /* set this file id as not in use */ - client->unifyfs_filelist[fid].in_use = 0; - /* add this id back to the free stack */ rc = unifyfs_fid_free(client, fid); if (rc != UNIFYFS_SUCCESS) { /* storage for the file was released, but we hit * an error while freeing the file id */ + pthread_mutex_unlock(&(client->sync)); return rc; } + pthread_mutex_unlock(&(client->sync)); + return UNIFYFS_SUCCESS; } @@ -432,7 +444,7 @@ int unifyfs_fid_open( } /* initialize local storage for this file */ - ret = fid_store_alloc(client, fid); + ret = fid_storage_alloc(client, fid); if (ret != UNIFYFS_SUCCESS) { LOGERR("failed to allocate storage space for file %s (fid=%d)", path, fid); @@ -513,7 +525,7 @@ int unifyfs_fid_open( } /* initialize local storage for this file */ - ret = fid_store_alloc(client, fid); + ret = fid_storage_alloc(client, fid); if (ret != UNIFYFS_SUCCESS) { LOGERR("failed to allocate storage space for file %s (fid=%d)", path, fid); @@ -645,7 +657,9 @@ unifyfs_filemeta_t* unifyfs_get_meta_from_fid(unifyfs_client* client, /* check that the file id is within range of our array */ if (fid >= 0 && fid < client->max_files) { /* get a pointer to the file meta data structure */ + pthread_mutex_lock(&(client->sync)); unifyfs_filemeta_t* meta = &(client->unifyfs_filemetas[fid]); + pthread_mutex_unlock(&(client->sync)); if (fid == meta->fid) { /* before returning metadata, process any pending callbacks */ @@ -656,6 +670,7 @@ unifyfs_filemeta_t* unifyfs_get_meta_from_fid(unifyfs_client* client, if (UNIFYFS_SUCCESS != rc) { LOGERR("fid delete failed"); } + return NULL; /* we just deleted it */ } } return meta; @@ -696,8 +711,8 @@ int unifyfs_fid_is_dir(unifyfs_client* client, int unifyfs_fid_is_dir_empty(unifyfs_client* client, const char* path) { - int i = 0; - while (i < client->max_files) { + pthread_mutex_lock(&(client->sync)); + for (int i = 0; i < client->max_files; i++) { /* only check this element if it's active */ if (client->unifyfs_filelist[i].in_use) { /* if the file starts with the path, it is inside of that directory @@ -708,13 +723,12 @@ int unifyfs_fid_is_dir_empty(unifyfs_client* client, /* found a child item in path */ LOGDBG("File found: unifyfs_filelist[%d].filename = %s", i, (char*)&client->unifyfs_filelist[i].filename); + pthread_mutex_unlock(&(client->sync)); return 0; } } - - /* go on to next file */ - i++; } + pthread_mutex_unlock(&(client->sync)); /* couldn't find any files with this prefix, dir must be empty */ return 1; @@ -742,15 +756,17 @@ int unifyfs_gfid_from_fid(unifyfs_client* client, int unifyfs_fid_from_gfid(unifyfs_client* client, int gfid) { - int i; - for (i = 0; i < client->max_files; i++) { + 
pthread_mutex_lock(&(client->sync)); + for (int i = 0; i < client->max_files; i++) { if (client->unifyfs_filelist[i].in_use && client->unifyfs_filemetas[i].attrs.gfid == gfid) { /* found a file id that's in use and it matches * the target fid, this is the one */ + pthread_mutex_unlock(&(client->sync)); return i; } } + pthread_mutex_unlock(&(client->sync)); return -1; } @@ -758,10 +774,13 @@ int unifyfs_fid_from_gfid(unifyfs_client* client, const char* unifyfs_path_from_fid(unifyfs_client* client, int fid) { + pthread_mutex_lock(&(client->sync)); unifyfs_filename_t* fname = &client->unifyfs_filelist[fid]; if (fname->in_use) { + pthread_mutex_unlock(&(client->sync)); return fname->filename; } + pthread_mutex_unlock(&(client->sync)); return NULL; } @@ -771,16 +790,19 @@ int unifyfs_fid_from_path(unifyfs_client* client, { /* scan through active entries in filelist array looking * for a match of path */ - int i = 0; - while (i < client->max_files && client->unifyfs_filelist[i].in_use) { - const char* filename = client->unifyfs_filelist[i].filename; - if (0 == strcmp(filename, path)) { - LOGDBG("File found: unifyfs_filelist[%d].filename = %s", - i, (char*)filename); - return i; + pthread_mutex_lock(&(client->sync)); + for (int i = 0; i < client->max_files; i++) { + if (client->unifyfs_filelist[i].in_use) { + const char* filename = client->unifyfs_filelist[i].filename; + if (0 == strcmp(filename, path)) { + LOGDBG("File found: unifyfs_filelist[%d].filename = %s", + i, (char*)filename); + pthread_mutex_unlock(&(client->sync)); + return i; + } } - i++; } + pthread_mutex_unlock(&(client->sync)); /* couldn't find specified path */ return -1; @@ -1048,16 +1070,19 @@ int unifyfs_fid_sync_data(unifyfs_client* client, int unifyfs_fid_sync_extents(unifyfs_client* client, int fid) { - /* assume we'll succeed */ - int ret = UNIFYFS_SUCCESS; - unifyfs_filemeta_t* meta = unifyfs_get_meta_from_fid(client, fid); - if ((NULL == meta) || (meta->fid != fid)) { + if (NULL == meta) { + LOGDBG("no filemeta for fid=%d", fid); + return UNIFYFS_SUCCESS; + } else if(meta->fid != fid) { /* bail out with an error if we fail to find it */ LOGERR("missing filemeta for fid=%d", fid); return UNIFYFS_FAILURE; } + /* assume we'll succeed */ + int ret = UNIFYFS_SUCCESS; + /* sync with server if we need to */ if (meta->needs_sync) { int rc; From b9067d336bef4e72e65aadaca953c2f448428fa5 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Thu, 2 Sep 2021 14:26:57 -0400 Subject: [PATCH 47/81] fix for failed reads when client.local_extents=1 --- client/src/client_read.c | 162 +++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 75 deletions(-) diff --git a/client/src/client_read.c b/client/src/client_read.c index d64373f2c..8d70d53f2 100644 --- a/client/src/client_read.c +++ b/client/src/client_read.c @@ -179,8 +179,6 @@ int client_update_mread_request(client_mread_status* mread, mread->n_error++; rdreq->nread = 0; rdreq->errcode = req_error; - } else { - rdreq->nread = rdreq->cover_end_offset + 1; } LOGINFO("updating mread[%u] status for request %u of %u " "(n_complete=%u, n_error=%u)", @@ -288,6 +286,7 @@ void update_read_req_coverage(read_req_t* req, if ((req->cover_end_offset == (size_t)-1) || (end_byte_offset > req->cover_end_offset)) { req->cover_end_offset = end_byte_offset; + req->nread = req->cover_end_offset + 1; } } @@ -473,6 +472,83 @@ int compare_read_req(const void* a, const void* b) } } +static void update_read_req_result(unifyfs_client* client, + read_req_t* req) +{ + debug_print_read_req(req); + + /* no error message was set, assume success */ + if (req->errcode == EINPROGRESS) { + req->errcode = UNIFYFS_SUCCESS; + } + + /* if we hit an error on our read, nothing else to do */ + if ((req->errcode != UNIFYFS_SUCCESS) && + (req->errcode != ENODATA)) { + return; + } + + /* if we read all of the bytes, request is satisfied */ + if (req->nread == req->length) { + /* check for read hole at beginning of request */ + if (req->cover_begin_offset != 0) { + /* fill read hole at beginning of request */ + LOGDBG("zero-filling hole at offset %zu of length %zu", + req->offset, req->cover_begin_offset); + memset(req->buf, 0, req->cover_begin_offset); + req->errcode = UNIFYFS_SUCCESS; + } + return; + } + + /* get file size for this file */ + off_t filesize_offt = unifyfs_gfid_filesize(client, req->gfid); + if (filesize_offt == (off_t)-1) { + /* failed to get file size */ + req->errcode = ENOENT; + return; + } + size_t filesize = (size_t) filesize_offt; + + if (filesize <= req->offset) { + /* request start offset is at or after EOF */ + req->nread = 0; + req->errcode = UNIFYFS_SUCCESS; + } else { + /* otherwise, we have a short read, check whether there + * would be a hole after us, in which case we fill the + * request buffer with zeros */ + + /* get offset of where hole starts */ + size_t gap_start = req->offset + req->nread; + + /* get last offset of the read request */ + size_t req_end = req->offset + req->length; + + /* if file size is larger than last offset we wrote to in + * read request, then there is a hole we can fill */ + if (filesize > gap_start) { + /* assume we can fill the full request with zero */ + size_t gap_length = req_end - gap_start; + if (req_end > filesize) { + /* request is trying to read past end of file, + * so only fill zeros up to end of file */ + gap_length = filesize - gap_start; + } + + /* copy zeros into request buffer */ + LOGDBG("zero-filling hole at offset %zu of length %zu", + gap_start, gap_length); + char* req_ptr = req->buf + req->nread; + memset(req_ptr, 0, gap_length); + + /* update number of bytes read and request status */ + req->nread += gap_length; + req->errcode = UNIFYFS_SUCCESS; + } + } +} + /** * Service a list of client read requests using either local * data or forwarding requests to the server. 
@@ -500,6 +576,7 @@ int process_gfid_reads(unifyfs_client* client, int ret = UNIFYFS_SUCCESS; /* assume we'll service all requests from the server */ + int local_count = 0; int server_count = in_count; read_req_t* server_reqs = in_reqs; read_req_t* local_reqs = NULL; @@ -543,6 +620,13 @@ int process_gfid_reads(unifyfs_client* client, * to be processed by the server */ service_local_reqs(client, in_reqs, in_count, local_reqs, server_reqs, &server_count); + local_count = in_count - server_count; + for (i = 0; i < local_count; i++) { + /* get pointer to next read request */ + read_req_t* req = local_reqs + i; + LOGDBG("local request %d:", i); + update_read_req_result(client, req); + } /* return early if we satisfied all requests locally */ if (server_count == 0) { @@ -653,78 +737,7 @@ int process_gfid_reads(unifyfs_client* client, /* get pointer to next read request */ read_req_t* req = server_reqs + i; LOGDBG("mread[%u] server request %d:", mread->id, i); - debug_print_read_req(req); - - /* no error message was received from server, assume success */ - if (req->errcode == EINPROGRESS) { - req->errcode = UNIFYFS_SUCCESS; - } - - /* if we hit an error on our read, nothing else to do */ - if ((req->errcode != UNIFYFS_SUCCESS) && - (req->errcode != ENODATA)) { - continue; - } - - /* if we read all of the bytes, request is satisfied */ - if (req->nread == req->length) { - /* check for read hole at beginning of request */ - if (req->cover_begin_offset != 0) { - /* fill read hole at beginning of request */ - LOGDBG("zero-filling hole at offset %zu of length %zu", - req->offset, req->cover_begin_offset); - memset(req->buf, 0, req->cover_begin_offset); - req->errcode = UNIFYFS_SUCCESS; - } - continue; - } - - /* get file size for this file */ - off_t filesize_offt = unifyfs_gfid_filesize(client, req->gfid); - if (filesize_offt == (off_t)-1) { - /* failed to get file size */ - req->errcode = ENOENT; - continue; - } - size_t filesize = (size_t)filesize_offt; - - if (filesize <= req->offset) { - /* request start offset is at or after EOF */ - req->nread = 0; - req->errcode = UNIFYFS_SUCCESS; - } else { - /* otherwise, we have a short read, check whether there - * would be a hole after us, in which case we fill the - * request buffer with zeros */ - - /* get offset of where hole starts */ - size_t gap_start = req->offset + req->nread; - - /* get last offset of the read request */ - size_t req_end = req->offset + req->length; - - /* if file size is larger than last offset we wrote to in - * read request, then there is a hole we can fill */ - if (filesize > gap_start) { - /* assume we can fill the full request with zero */ - size_t gap_length = req_end - gap_start; - if (req_end > filesize) { - /* request is trying to read past end of file, - * so only fill zeros up to end of file */ - gap_length = filesize - gap_start; - } - - /* copy zeros into request buffer */ - LOGDBG("zero-filling hole at offset %zu of length %zu", - gap_start, gap_length); - char* req_ptr = req->buf + req->nread; - memset(req_ptr, 0, gap_length); - - /* update number of bytes read and request status */ - req->nread += gap_length; - req->errcode = UNIFYFS_SUCCESS; - } - } + update_read_req_result(client, req); } /* if we attempted to service requests from our local extent map, @@ -735,7 +748,6 @@ int process_gfid_reads(unifyfs_client* client, * in which we received them. 
*/ /* copy locally completed requests back into user's array */ - int local_count = in_count - server_count; if (local_count > 0) { memcpy(in_reqs, local_reqs, local_count * sizeof(read_req_t)); } From d8de7baf7056a0912855d260a293e87635edcb49 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Tue, 7 Sep 2021 16:14:40 -0400 Subject: [PATCH 48/81] initialize unifyfs_io_request result and state in dispatch --- client/src/unifyfs_api_io.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/client/src/unifyfs_api_io.c b/client/src/unifyfs_api_io.c index 17602a156..37ed288b6 100644 --- a/client/src/unifyfs_api_io.c +++ b/client/src/unifyfs_api_io.c @@ -163,6 +163,13 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, size_t n_sync = 0; for (size_t i = 0; i < nreqs; i++) { req = reqs + i; + + /* set initial request result and state */ + req->state = UNIFYFS_IOREQ_STATE_INVALID; + req->result.error = UNIFYFS_SUCCESS; + req->result.count = 0; + req->result.rc = 0; + switch (req->op) { case UNIFYFS_IOREQ_NOP: break; @@ -182,6 +189,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, break; default: LOGERR("invalid ioreq operation"); + req->result.error = EINVAL; return EINVAL; } } From bc7a4bc024446e4cb58fdff79213eede78df94c4 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Wed, 8 Sep 2021 11:27:52 -0400 Subject: [PATCH 49/81] Fix various memory leaks Fix a variety of individual memory leaks: * Removed 'static' descriptor from unifyfs_inode_destroy() so it could be called from unifyfs_inode_tree_clear() and then call it from unifyfs_inode_tree_clear() instead of free'ing components of the inodes manually. * Error handling conditions in unifyfs_rm_thrd_create() didn't always free everything they should. * In process_metaset_rpc(), properly free requests created by create_mountpoint_dir(). (See detailed comments in the code.) * In unifyfs_exit(), free the appconfigs_abt_sync mutex and finalize the config variables. * In svcmgr_fini(), free the sm->reqs_sync mutex. * In cleanup_app_client(), before freeing the reqmgr struct, free the reqmgr->client_reqs and reqmgr->client_callbacks arraylists and the reqmgr->reqs_sync mutex. * In margo_server_rpc_finalize(), free the server_infos struct. --- server/src/margo_server.c | 2 ++ server/src/unifyfs_inode.c | 3 --- server/src/unifyfs_inode.h | 9 +++++++++ server/src/unifyfs_inode_tree.c | 12 ++++++------ server/src/unifyfs_request_manager.c | 24 ++++++++++++++++++++++++ server/src/unifyfs_server.c | 25 +++++++++++++++++++++++++ server/src/unifyfs_service_manager.c | 6 ++++++ 7 files changed, 72 insertions(+), 9 deletions(-) diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 412d1f046..2f2eb218f 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -446,6 +446,8 @@ int margo_server_rpc_finalize(void) } } + free(server_infos); + /* shut down margo */ LOGDBG("finalizing server-server margo"); margo_finalize(ctx->svr_mid); diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index 6bd31a4ac..4397d847c 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -51,8 +51,6 @@ struct unifyfs_inode* unifyfs_inode_alloc(int gfid, unifyfs_file_attr_t* attr) return ino; } -static int unifyfs_inode_destroy(struct unifyfs_inode* ino); - /** * @brief read lock the inode for ro access. 
* @@ -122,7 +120,6 @@ static int int_cmp_fn(const void* a, const void* b) return ai - bi; } -static int unifyfs_inode_destroy(struct unifyfs_inode* ino) { int ret = UNIFYFS_SUCCESS; diff --git a/server/src/unifyfs_inode.h b/server/src/unifyfs_inode.h index e2412e4ac..0ee7c85f9 100644 --- a/server/src/unifyfs_inode.h +++ b/server/src/unifyfs_inode.h @@ -54,6 +54,15 @@ struct unifyfs_inode { */ int unifyfs_inode_create(int gfid, unifyfs_file_attr_t* attr); +/** + * @brief delete an inode and all its contents + * + * @param ino inode to destroy + * + * @return 0 on success, errno otherwise + */ +int unifyfs_inode_destroy(struct unifyfs_inode* ino); + /** * @brief update the attributes of file with @gfid. The attributes are * selectively updated with unifyfs_file_attr_update() function (see diff --git a/server/src/unifyfs_inode_tree.c b/server/src/unifyfs_inode_tree.c index ba38391d7..039174e89 100644 --- a/server/src/unifyfs_inode_tree.c +++ b/server/src/unifyfs_inode_tree.c @@ -189,19 +189,19 @@ void unifyfs_inode_tree_clear( while ((node = unifyfs_inode_tree_iter(tree, node))) { if (oldnode) { RB_REMOVE(rb_inode_tree, &tree->head, oldnode); - if (oldnode->extents != NULL) { - extent_tree_destroy(oldnode->extents); + int rc = unifyfs_inode_destroy(oldnode); + if (rc) { + LOGERR("Error %d from unifyfs_inode_destroy()", rc); } - free(oldnode); } oldnode = node; } if (oldnode) { RB_REMOVE(rb_inode_tree, &tree->head, oldnode); - if (oldnode->extents != NULL) { - extent_tree_destroy(oldnode->extents); + int rc = unifyfs_inode_destroy(oldnode); + if (rc) { + LOGERR("Error %d from unifyfs_inode_destroy()", rc); } - free(oldnode); } unifyfs_inode_tree_unlock(tree); diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 864364cae..af075ad0d 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -134,6 +134,8 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) arraylist_create(UNIFYFS_CLIENT_MAX_ACTIVE_REQUESTS); if (thrd_ctrl->client_reqs == NULL) { LOGERR("failed to allocate request manager client_reqs!"); + ABT_mutex_free(&(thrd_ctrl->reqs_sync)); + pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); free(thrd_ctrl); return NULL; @@ -144,6 +146,9 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) arraylist_create(UNIFYFS_CLIENT_MAX_FILES); if (thrd_ctrl->client_callbacks == NULL) { LOGERR("failed to allocate request manager client_callbacks!"); + arraylist_free(thrd_ctrl->client_reqs); + ABT_mutex_free(&(thrd_ctrl->reqs_sync)); + pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); free(thrd_ctrl); return NULL; @@ -165,6 +170,9 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) LOGERR("failed to create request manager thread for " "app_id=%d client_id=%d - rc=%d (%s)", app_id, client_id, rc, strerror(rc)); + arraylist_free(thrd_ctrl->client_callbacks); + arraylist_free(thrd_ctrl->client_reqs); + ABT_mutex_free(&(thrd_ctrl->reqs_sync)); pthread_cond_destroy(&(thrd_ctrl->thrd_cond)); pthread_mutex_destroy(&(thrd_ctrl->thrd_lock)); free(thrd_ctrl); @@ -1227,8 +1235,24 @@ static int process_metaset_rpc(reqmgr_thrd_t* reqmgr, if (NULL != in->attr.filename) { fattr.filename = strdup(in->attr.filename); } + + /* This is somewhat ugly: if the input came via a standard Mercury RPC, + * then req->handle will exist and margo_free_input() will clean up 'in' + * correctly. 
+ * + * *HOWEVER*, there's one case where we'll end up here without going + * through an RPC: the request created by create_mountpoint_dir() is + * created locally. More specifically, margo_get_input() is not used to + * create the 'in' struct and thus margo_free_input() should not be + * called. That said, in->attr.filename is allocated with strdup(), and + * must therefore be freed before we free 'in'. + */ if (HG_HANDLE_NULL != req->handle) { margo_free_input(req->handle, in); + } else { + if (NULL != in->attr.filename) { + free(in->attr.filename); + } } free(in); diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 1f0628a60..96ccf40d7 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -643,6 +643,10 @@ static int find_rank_idx(int my_rank) static int unifyfs_exit(void) { int ret = UNIFYFS_SUCCESS; + /* Note: ret could potentially get overwritten a few times. Since this + * is shutdown/cleanup code, we'll do as much cleanup as we can and just + * return the most recent value of ret. + */ /* iterate over each active application and free resources */ LOGDBG("cleaning application state"); @@ -670,6 +674,11 @@ static int unifyfs_exit(void) LOGDBG("finalizing kvstore service"); unifyfs_keyval_fini(); + ret = ABT_mutex_free(&app_configs_abt_sync); + if (ret != ABT_SUCCESS) { + LOGERR("Error returned from ABT_mutex_free(): %d", ret); + } + /* shutdown rpc service * (note: this needs to happen after app-client cleanup above) */ LOGDBG("stopping rpc service"); @@ -686,6 +695,13 @@ static int unifyfs_exit(void) MPI_Finalize(); #endif + /* Finalize the config variables */ + LOGDBG("Finalizing config variables"); + ret = unifyfs_config_fini(&server_cfg); + if (ret != ABT_SUCCESS) { + LOGERR("Error returned from unifyfs_config_fini(): %d", ret); + } + LOGDBG("all done!"); unifyfs_log_close(); @@ -1036,6 +1052,15 @@ unifyfs_rc cleanup_app_client(app_config* app, app_client* client) /* free client structure */ if (NULL != client->reqmgr) { + if (NULL != client->reqmgr->client_reqs) { + arraylist_free(client->reqmgr->client_reqs); + } + if (NULL != client->reqmgr->client_callbacks) { + arraylist_free(client->reqmgr->client_callbacks); + } + + ABT_mutex_free(&(client->reqmgr->reqs_sync)); + free(client->reqmgr); client->reqmgr = NULL; } diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index b16840247..96e4fce6c 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -273,6 +273,12 @@ int svcmgr_fini(void) arraylist_free(sm->svc_reqs); } + int abt_err = ABT_mutex_free(&(sm->reqs_sync)); + if (ABT_SUCCESS != abt_err) { + /* All we can really do here is log the error */ + LOGERR("Error code returned from ABT_mutex_free(): %d", abt_err); + } + if (sm->initialized) { pthread_mutex_destroy(&(sm->thrd_lock)); pthread_cond_destroy(&(sm->thrd_cond)); From 31366197c14f15e67fd99fb311f0256f252e2963 Mon Sep 17 00:00:00 2001 From: CamStan Date: Wed, 8 Sep 2021 16:06:28 -0700 Subject: [PATCH 50/81] Add stage/transfer API tests to CI framework Creates lower-level tests (t/ci/800-stage-tests.sh) to do a proof of concept of the transfer API. Also creates a higher-level file (t/ci/RUN_CI_STAGE_TESTS.sh) that runs the lower-level tests over a sweep of file sizes and server configurations. Added a parallel transfer test to t/ci/800-stage-tests.sh as well. It is currently being skipped until the parallel transfer logic is fixed. 
Fixes a bug in serial transfer logic causing the destination file to be created as read-only, resulting in a failed transfer. Update transfer docs to account for the various methods. --- client/src/posix_client.c | 18 ++- docs/run.rst | 242 ++++++++++++++++++++++++++++------- t/ci/800-stage-tests.sh | 249 +++++++++++++++++++++++++++++++++++++ t/ci/990-stop-server.sh | 70 +++++++---- t/ci/RUN_CI_STAGE_TESTS.sh | 101 +++++++++++++++ t/ci/RUN_CI_TESTS.sh | 8 +- 6 files changed, 610 insertions(+), 78 deletions(-) create mode 100755 t/ci/800-stage-tests.sh create mode 100755 t/ci/RUN_CI_STAGE_TESTS.sh diff --git a/client/src/posix_client.c b/client/src/posix_client.c index dbdbfea99..a36d40dca 100644 --- a/client/src/posix_client.c +++ b/client/src/posix_client.c @@ -872,12 +872,24 @@ int unifyfs_transfer_file(const char* src, return -EINVAL; } - /* for both serial and parallel transfers, use rank 0 client to - * create the destination file using the source file's mode */ + /* TODO: Fix parallel transfer logic + * for both serial and parallel transfers, use rank 0 client to + * create the destination file */ if (0 == client_rank) { errno = 0; int create_flags = O_CREAT | O_WRONLY | O_TRUNC; - int fd = UNIFYFS_WRAP(open)(dst_path, create_flags, sb_src.st_mode); + int dst_mode; + + if (unify_src) { + /* Destination file needs to be writable; file in UnifyFS may have + * been laminated */ + dst_mode = 0640; + } else { + /* Use the source file's mode */ + dst_mode = sb_src.st_mode; + } + + int fd = UNIFYFS_WRAP(open)(dst_path, create_flags, dst_mode); err = errno; if (fd < 0) { LOGERR("failed to create destination file %s", dst_path); diff --git a/docs/run.rst b/docs/run.rst index 6574ed650..b4f67a4f4 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -17,9 +17,9 @@ Overall, the steps to run an application with UnifyFS include: 5. Terminate the UnifyFS servers using ``unifyfs`` --------------------- - Start UnifyFS --------------------- +------------- +Start UnifyFS +------------- First, one must start the UnifyFS server process (``unifyfsd``) on the nodes in the job allocation. UnifyFS provides the ``unifyfs`` command line utility to @@ -57,7 +57,6 @@ adjust the consistency model, and control stage-in and stage-out of files. The full usage for ``unifyfs`` is as follows: .. 
code-block:: Bash - :linenos: [prompt]$ unifyfs --help @@ -72,18 +71,21 @@ The full usage for ``unifyfs`` is as follows: -h, --help print usage Command options for "start": - -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX) - -e, --exe= [OPTIONAL] where unifyfsd is installed - -m, --mount= [OPTIONAL] mount UnifyFS at - -s, --script= [OPTIONAL] to custom launch script - -t, --timeout= [OPTIONAL] wait until all servers become ready - -S, --share-dir= [REQUIRED] shared file system for use by servers - -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit - -i, --stage-in= [OPTIONAL] stage in manifest file(s) at + -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX) + -e, --exe= [OPTIONAL] where unifyfsd is installed + -m, --mount= [OPTIONAL] mount UnifyFS at + -s, --script= [OPTIONAL] to custom launch script + -t, --timeout= [OPTIONAL] wait until all servers become ready + -S, --share-dir= [REQUIRED] shared file system for use by servers + -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit + -i, --stage-in= [OPTIONAL] stage in file(s) listed in file + -T, --stage-timeout= [OPTIONAL] timeout for stage-in operation Command options for "terminate": - -s, --script= [OPTIONAL] to custom termination script - -o, --stage-out= [OPTIONAL] stage out manifest file(s) at + -o, --stage-out= [OPTIONAL] stage out file(s) listed in on termination + -T, --stage-timeout= [OPTIONAL] timeout for stage-out operation + -s, --script= [OPTIONAL] to custom termination script + -S, --share-dir= [REQUIRED for --stage-out] shared file system for use by servers After UnifyFS servers have been successfully started, you may run your @@ -92,18 +94,18 @@ Only applications that explicitly call ``unifyfs_mount()`` and access files under the specified mountpoint prefix will utilize UnifyFS for their I/O. All other applications will operate unchanged. --------------------- - Stop UnifyFS --------------------- +------------ +Stop UnifyFS +------------ -After all UnifyFS-enabled applications have completed running, you should -use ``unifyfs terminate`` to terminate the servers. Typically, one would pass -the ``--cleanup`` option to ``unifyfs start`` to have the servers remove -temporary data locally stored on each node after termination. +After all UnifyFS-enabled applications have completed running, use +``unifyfs terminate`` to terminate the servers. Pass the ``--cleanup`` option to +``unifyfs start`` to have the servers remove temporary data locally stored on +each node after termination. ------------------------------------- - Resource Manager Job Integration ------------------------------------- +-------------------------------- +Resource Manager Job Integration +-------------------------------- UnifyFS includes optional support for integrating directly with compatible resource managers to automatically start and stop servers at the beginning @@ -118,28 +120,176 @@ within the source repository at ``util/scripts/lsfcsm``. Support for the SLURM resource manager is under development. ------------------------------------------------ - Stage-in and Stage-out Manifest File Format ------------------------------------------------ - -The manifest file contains one or more file copy requests. -Each line in the manifest corresponds to one file copy request, -and it contains both the source and destination file paths. Currently, -directory copies are not supported. - -Each line is formatted as: `` ``. 
-If either of the filenames -contain whitespace or special characters, then both filenames should -be surrounded by double-quote characters (") (ASCII character 34 decimal). -The double-quote character and the linefeed end-of-line character are forbidden -in any filenames used in a unifyfs manifest file, but any other -characters are allowed, including control characters. -If a filename contains any characters that might be misinterpreted, then -enclosing the filename in double-quotes is always -a safe thing to do. +----------------------------------------- +Transferring Data In and Out of UnifyFS +----------------------------------------- + +Data can be transferred in/out of UnifyFS during server startup and termination, +or at any point during a job using two stand-alone applications. + +Transfer at Server Start/Terminate +********************************** + +The transfer subsystem within UnifyFS can be invoked by providing the +``-i|--stage-in`` option to ``unifyfs start`` to transfer files into UnifyFS: + +.. code-block:: Bash + + $ unifyfs start --stage-in=/path/to/input/manifest/file --share-dir=/path/to/shared/file/system + +and/or by providing the ``-o|--stage-out``, and consequently required +``-S|--share-dir``, option to ``unifyfs terminate`` to transfer files out of +UnifyFS: + +.. code-block:: Bash + + $ unifyfs terminate --stage-out=/path/to/output/manifest/file --share-dir=/path/to/shared/file/system + +A manifest file needs to be provided to the ``start``/``terminate`` commands in +order to specify the desired transfers. + +.. _manifest_file_label: + +Manifest File +^^^^^^^^^^^^^ + +UnifyFS's stage functionality requires a manifest file in order to move data. + +The manifest file contains one or more file copy requests. Each line in the +manifest corresponds to one transfer request, and it contains both the source +and destination file paths. Directory copies are currently not supported. + +Each line is formatted as: +`` ``. + +If either of the filenames contain whitespace or special characters, then both +filenames should be surrounded by double-quote characters (") (ASCII character +34 decimal). +The double-quote and linefeed end-of-line characters are not supported in any +filenames used in a unifyfs manifest file. Any other characters are allowed, +including control characters. If a filename contains any characters that might +be misinterpreted, then enclosing the filename in double-quotes is always a safe +thing to do. Here is an example of a valid stage-in manifest file: -``/scratch/users/me/input_data/input_1.dat /unifyfs/input/input_1.dat`` -``/home/users/me/configuration/run_12345.conf /unifyfs/config/run_12345.conf`` -``"/home/users/me/file with space.dat" "/unifyfs/file with space.dat"`` +.. code-block:: Bash + + $ [prompt] cat example_stage_in.manifest + + /scratch/users/me/input_data/input_1.dat /unifyfs/input/input_1.dat + /home/users/me/configuration/run_12345.conf /unifyfs/config/run_12345.conf + "/home/users/me/file with space.dat" "/unifyfs/file with space.dat" + +Transfer During Job +******************* + +Data can also be transferred in/out of UnifyFS using one of two stand-alone +applications. + +The stand-alone applications can be invoked at any time while the UnifyFS +servers are up and responding to requests. This allows for bringing in new input +and/or transferring results out to be verified before the job terminates. 
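For illustration only (the paths, manifest name, and ``srun`` launcher below are placeholders,
not part of this patch), a mid-job stage-out could be as simple as writing a one-line manifest
and running ``unifyfs-stage`` against it with the ``-m`` mountpoint option shown later:

.. code-block:: Bash

    # copy one result file out of UnifyFS while the servers are still running
    echo "/unifyfs/output/result.dat /scratch/$USER/result.dat" > stage_out.manifest
    srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/unifyfs-stage -m /unifyfs stage_out.manifest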
+ +UnifyFS Stage Executable +^^^^^^^^^^^^^^^^^^^^^^^^ + +``$UNIFYFS_INSTALL/libexec/unifyfs-stage`` + +The ``start``/``terminate`` transfer API stage functionality can also be used +via the stand-alone application ``unifyfs-stage``. + +This application can be run at any time within a job to transfer new data into + or results out of UnifyFS. +A manifest file (see :ref:`above `_) needs to be provided +as an argument to use this approach. + +.. code-block:: Bash + + [prompt]$ ./unifyfs-stage --help + + Usage: unifyfs-stage [OPTION]... + + Transfer files between unifyfs volume and external file system. + The should contain list of files to be transferred, + and each line should be formatted as + + /source/file/path /destination/file/path + + OR in the case of filenames with spaces or special characters: + + "/source/file/path" "/destination/file/path" + + One file per line; Specifying directories is not supported. + + Available options: + -c, --checksum verify md5 checksum for each transfer + -h, --help print this usage + -m, --mountpoint= use as unifyfs mountpoint + (default: /unifyfs) + -p, --parallel transfer each file in parallel + (experimental) + -s, --share-dir= directory path for creating status file + -v, --verbose print noisy outputs + + Without the '-p, --parallel' option, a file is transferred by a single + process. If the '-p, --parallel' option is specified, each file will be + divided by multiple processes and transferred in parallel. + +Examples: + +.. code-block:: Bash + :caption: Serial Transfer + + $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/unifyfs-stage $MY_MANIFEST_FILE + +.. code-block:: Bash + :caption: Parallel Transfer + + $ srun -N 4 -n 8 $UNIFYFS_INSTALL/libexec/unifyfs-stage --parallel $MY_MANIFEST_FILE + +Transfer Executable +^^^^^^^^^^^^^^^^^^^ + +``$UNIFYFS_INSTALL/libexec/transfer-static`` + +.. note:: + + The ``transfer-gotcha`` executable is currently unusable due to an issue + that is being tracked. + +The transfer API can also be used during the job by invoking the stand-alone +``transfer`` application. It works similarly to the Unix ``cp`` command, with +source and destination, except being aware that it is copying files between an +external file system and internal UnifyFS. + +.. code-block:: Bash + + [prompt]$ transfer-static --help + + Usage: transfer-static [options...] + + Available options: + -d, --debug pause before running test + (handy for attaching in debugger) + -h, --help help message + -m, --mount= use for unifyfs + (default: /unifyfs) + -p, --parallel parallel transfer + -r, --rank= use for transfer (default: 0) + +Examples of using ``transfer-static``: + +.. code-block:: Bash + :caption: Serial Transfer + + $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/transfer-static /path/on/parallelfs/file.dat /unifyfs/file.dat + + $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/transfer-static /unifyfs/output.dat /scratch/my_output/output.dat + +.. code-block:: Bash + :caption: Parallel Transfer + + $ srun -N 4 -n 8 /path/to/libexec/transfer-static -parallel /path/on/parallelfs/file.dat /unifyfs/file.dat + + $ srun -N 4 -n 8 /path/to/libexec/transfer-static -parallel /unifyfs/output.dat /scratch/my_output/output.dat diff --git a/t/ci/800-stage-tests.sh b/t/ci/800-stage-tests.sh new file mode 100755 index 000000000..c2308889a --- /dev/null +++ b/t/ci/800-stage-tests.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# this is a unit speed test. It creates a randomized file +# on the parallel file system. 
Then it uses the transfer +# API to transfer that file into the UnifyFS space, then +# then transfers it back out again to a *different* location. +# The validation test is to compare the final file to the +# original file to make sure they're bit by bit identical. + +# env variable STAGE_FILE_SIZE_IN_MB sets the size +# this has a default below of 100 MB + +# As long as UNIFY_LOG_DIR is set this script will have +# places to put its temp files. This includes successive +# of this script internally keeping track of timing +# iteration indices. + +# These two variables are written to every timing data line: +# the variable STAGE_TEST_OVERALL_CONFIG is used to tag a whole set +# of tests. +# the variable STAGE_TEST_SPECIFIC_CONFIG tags lower-level config +# info, typically what's changing in the overall set of tests +# file size is taken care of, so the overall config variable +# will typically be what machine or software is being tested. +# The specific config might be a specific server configuration +# that's being checked or iterated over. + +test_description="UnifyFS Stage-in+Stage-out tests" + +# utility checking to make sure everything's in place. +STAGE_EXE=${UNIFYFS_EXAMPLES}/unifyfs-stage +test_expect_success "unify-stage executable exists" ' + test_path_is_file $STAGE_EXE +' +test_expect_success "\$UNIFYFS_LOG_DIR exists" ' + test_path_is_dir $UNIFYFS_LOG_DIR +' + +# make and check utility directories +STAGE_CFG_DIR="${UNIFYFS_LOG_DIR}/stage/config_800" +STAGE_LOG_DIR="${UNIFYFS_LOG_DIR}/stage/log_800" +STAGE_SRC_DIR="${UNIFYFS_LOG_DIR}/stage/stage_source_800" +STAGE_DST_DIR="${UNIFYFS_LOG_DIR}/stage/stage_destination_800" +mkdir -p $STAGE_CFG_DIR $STAGE_LOG_DIR $STAGE_SRC_DIR $STAGE_DST_DIR + +test_expect_success "stage testing dirs exist" ' + test_path_is_dir $STAGE_CFG_DIR && + test_path_is_dir $STAGE_LOG_DIR && + test_path_is_dir $STAGE_SRC_DIR && + test_path_is_dir $STAGE_DST_DIR +' + +# keeping track of where we are in iterations to *guarantee* even +# with weird iteration counts that the iter counter will always +# be unique and thus preserve data. +STAGE_TEST_INDEX_FILE="${STAGE_LOG_DIR}/stage_test_index.txt" +# ensure existence of index file +# index should consist of a number +if ! [ -e "$STAGE_TEST_INDEX_FILE" ] ; then + echo "1" > ${STAGE_TEST_INDEX_FILE} +fi +# the test index file now exists +STAGE_TEST_INDEX=`head -n1 ${STAGE_TEST_INDEX_FILE}` +NEW_STAGE_TEST_INDEX=$(( STAGE_TEST_INDEX + 1 )) +echo ${NEW_STAGE_TEST_INDEX} > ${STAGE_TEST_INDEX_FILE} +echo "stage index this run: ${STAGE_TEST_INDEX}" + +if [ ! $STAGE_FILE_SIZE_IN_MB ] ; then + STAGE_FILE_SIZE_IN_MB=100 +fi +echo "stage file test: file size is $STAGE_FILE_SIZE_IN_MB in MB" +STAGE_FILE_CFG=${STAGE_FILE_SIZE_IN_MB}MB_${STAGE_TEST_INDEX} + +# empty comment so format checker doesn't complain about next line +STAGE_SRC_FILE=${STAGE_SRC_DIR}/source_800_${STAGE_FILE_CFG}.file + +# fill initial file with random bytes +dd if=/dev/urandom bs=1M count=${STAGE_FILE_SIZE_IN_MB} of=${STAGE_SRC_FILE} + +test_expect_success "source.file exists" ' + test_path_is_file $STAGE_SRC_FILE +' + +rm -f ${STAGE_CFG_DIR}/* +rm -f ${STAGE_DST_DIR}/* + +test_expect_success "destination directory is empty" ' + test_dir_is_empty $STAGE_DST_DIR +' + +##### Serial unifyfs-stage tests ##### + +# set up what the intermediate filename will be within UnifyFS and also +# the final file name after copying it back out. Then use those +# filenames to create the two manifest files, one for copying the +# file in, and one for copying the file out. 
+STAGE_IM_FILE=${UNIFYFS_MP}/intermediate_${STAGE_FILE_CFG}.file +STAGE_DST_FILE=${STAGE_DST_DIR}/destination_800_${STAGE_FILE_CFG}.file +MAN_IN=${STAGE_CFG_DIR}/stage_IN_${STAGE_FILE_CFG}.manifest +MAN_OUT=${STAGE_CFG_DIR}/stage_OUT_${STAGE_FILE_CFG}.manifest +echo "\"${STAGE_SRC_FILE}\" \"${STAGE_IM_FILE}\"" > ${MAN_IN} +echo "\"${STAGE_IM_FILE}\" \"${STAGE_DST_FILE}\"" > ${MAN_OUT} + +test_expect_success "config directory now has manifest files" ' + test_path_is_file $MAN_IN && + test_path_is_file $MAN_OUT +' + +STAGE_IN_ERR=${STAGE_LOG_DIR}/stage_IN_800_${STAGE_FILE_CFG}.err +STAGE_OUT_ERR=${STAGE_LOG_DIR}/stage_OUT_800_${STAGE_FILE_CFG}.err + +# run and time the stage-in operation. +STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c ${MAN_IN}" +echo "stagein_command: ${STAGEIN_COMMAND}" +TIME_IN_START=`date +%s` +my_stagein_output="$($STAGEIN_COMMAND)" +TIME_IN_END=`date +%s` + +# run and time the stage-out operation. +STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c ${MAN_OUT}" +echo "stageOUT_command: ${STAGEOUT_COMMAND}" +TIME_OUT_START=`date +%s` +my_stageout_output="$($STAGEOUT_COMMAND)" +TIME_OUT_END=`date +%s` + +STAGE_IN_LOG=${STAGE_LOG_DIR}/stage_IN_800_${STAGE_FILE_CFG}.out +STAGE_OUT_LOG=${STAGE_LOG_DIR}/stage_OUT_800_${STAGE_FILE_CFG}.out +echo $my_stagein_output > $STAGE_IN_LOG +echo $my_stageout_output > $STAGE_OUT_LOG + +ELAPSED_TIME_IN=$(( ${TIME_IN_END} - ${TIME_IN_START} )) +ELAPSED_TIME_OUT=$(( ${TIME_OUT_END} - ${TIME_OUT_START} )) +echo "time to stage in: $ELAPSED_TIME_IN s" +echo "time to stage out: $ELAPSED_TIME_OUT s" + +test_expect_success "input file has been staged to output" ' + test_path_is_file $STAGE_DST_FILE +' + +# This block is used to indirectly get the test result back to us +# of whether the file comparison failed, so that we can put +# that result in the line of the timing file. +SUCCESS_TOTAL_BEFORE=${test_success} +test_expect_success "final output is identical to initial input" ' + test_cmp $STAGE_DST_FILE $STAGE_SRC_FILE +' +SUCCESS_TOTAL_AFTER=${test_success} + +# If the success total is *different*, then the final test +# (the file comparison after to before) passed. +# If they're the same, then it failed. +if [ ${SUCCESS_TOTAL_BEFORE} == ${SUCCESS_TOTAL_AFTER} ]; then + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %FAIL%" \ + >> ${STAGE_LOG_DIR}/timings_serial_${JOB_ID}.dat +else + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %GOOD%" \ + >> ${STAGE_LOG_DIR}/timings_serial_${JOB_ID}.dat +fi + + +##### Parallel unifyfs-stage tests ##### + +# TODO: Remove if block once parallel transfer logic is working +if test_have_prereq PARALLEL_TRANSFER_FIXED; then + # set up what the intermediate filename will be within UnifyFS and also + # the final file name after copying it back out. Then use those + # filenames to create the two manifest files, one for copying the + # file in, and one for copying the file out. 
+ STAGE_IM_FILE=${UNIFYFS_MP}/intermediate_parallel_${STAGE_FILE_CFG}.file + STAGE_P_DST_FILE=${STAGE_DST_DIR}/destination_parallel_800_${STAGE_FILE_CFG}.file + MAN_IN=${STAGE_CFG_DIR}/stage_IN_parallel_${STAGE_FILE_CFG}.manifest + MAN_OUT=${STAGE_CFG_DIR}/stage_OUT_parallel_${STAGE_FILE_CFG}.manifest + echo "\"${STAGE_SRC_FILE}\" \"${STAGE_IM_FILE}\"" > ${MAN_IN} + echo "\"${STAGE_IM_FILE}\" \"${STAGE_P_DST_FILE}\"" > ${MAN_OUT} + + test_expect_success "config directory now has parallel manifest files" ' + test_path_is_file $MAN_IN && + test_path_is_file $MAN_OUT + ' + + STAGE_IN_ERR=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.err + STAGE_OUT_ERR=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.err + + # run and time the stage-in operation. + STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c -p ${MAN_IN}" + echo "stagein_command: ${STAGEIN_COMMAND}" + TIME_IN_START=`date +%s` + my_stagein_output="$($STAGEIN_COMMAND)" + TIME_IN_END=`date +%s` + + # run and time the stage-out operation. + STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c -p ${MAN_OUT}" + echo "stageOUT_command: ${STAGEOUT_COMMAND}" + TIME_OUT_START=`date +%s` + my_stageout_output="$($STAGEOUT_COMMAND)" + TIME_OUT_END=`date +%s` + + STAGE_IN_LOG=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.out + STAGE_OUT_LOG=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.out + echo $my_stagein_output > $STAGE_IN_LOG + echo $my_stageout_output > $STAGE_OUT_LOG + + ELAPSED_TIME_IN=$(( ${TIME_IN_END} - ${TIME_IN_START} )) + ELAPSED_TIME_OUT=$(( ${TIME_OUT_END} - ${TIME_OUT_START} )) + echo "time to stage in parallel: $ELAPSED_TIME_IN s" + echo "time to stage out parallel: $ELAPSED_TIME_OUT s" + + test_expect_success "parallel: input file has been staged to output" ' + test_path_is_file $STAGE_P_DST_FILE + ' + + # This block is used to indirectly get the test result back to us + # of whether the file comparison failed, so that we can put + # that result in the line of the timing file. + SUCCESS_TOTAL_BEFORE=${test_success} + test_expect_success "parallel: final output is identical to initial input" ' + test_cmp $STAGE_P_DST_FILE $STAGE_SRC_FILE + ' + SUCCESS_TOTAL_AFTER=${test_success} + + # If the success total is *different*, then the final test + # (the file comparison after to before) passed. + # If they're the same, then it failed. 
+ if [ ${SUCCESS_TOTAL_BEFORE} == ${SUCCESS_TOTAL_AFTER} ]; then + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %FAIL%" \ + >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat + else + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %GOOD%" \ + >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat + fi + + test_expect_success "serial output is identical to parallel output" ' + test_cmp $STAGE_DST_FILE $STAGE_P_DST_FILE + ' +fi + +rm -f $STAGE_SRC_FILE diff --git a/t/ci/990-stop-server.sh b/t/ci/990-stop-server.sh index 79b7ed84f..54a8799cd 100755 --- a/t/ci/990-stop-server.sh +++ b/t/ci/990-stop-server.sh @@ -5,16 +5,22 @@ test_description="Stopping the UnifyFS server" -while [[ $# -gt 0 ]] +DONE_TESTING="1" + +for arg in "$@" do - case $1 in + case $arg in -h|--help) ci_dir=$(dirname "$(readlink -fm $BASH_SOURCE)") $ci_dir/001-setup.sh -h exit ;; + --allow-restart) + DONE_TESTING="0" + echo "Stop servers but allow restart and additional testing" + ;; *) - echo "usage ./990-stop-server.sh -h|--help" + echo "usage ./990-stop-server.sh -h|--help|--allow-restart" exit 1 ;; esac @@ -34,32 +40,42 @@ test_expect_success "verify unifyfsd has stopped" ' test_must_fail process_is_running unifyfsd 10 ' -# If UNIFYFS_MOUNTPOINT is an existing dir, verify that is it empty -test_expect_success REAL_MP "Verify UNIFYFS_MOUNTPOINT ($UNIFYFS_MP) is empty" ' - test_dir_is_empty $UNIFYFS_MP -' -# Cleanup posix mountpoint -test_expect_success POSIX "Cleanup UNIFYFS_CI_POSIX_MP: $UNIFYFS_CI_POSIX_MP" ' - rm -rf $UNIFYFS_CI_POSIX_MP/*posix* -' +if [ "$DONE_TESTING" -eq "1" ]; then -# cleanup_hosts -test_expect_success PDSH,CLEAN "Cleanup hosts" ' - cleanup_hosts -' -# Remove trap -# If any tests failed, the suite will exit with 1 which will trigger the trap. -# Since the hosts were already cleaned at this point, can remove trap to prevent -# cleanup_hosts from being called again. -trap - EXIT -# end here if running tests individually -if [[ -z $full_run ]]; then - ( test_done; ) - test_exit_code=$? + # If UNIFYFS_MOUNTPOINT is an existing dir, verify that is it empty + test_expect_success REAL_MP \ + "Verify UNIFYFS_MOUNTPOINT ($UNIFYFS_MP) is empty" ' + test_dir_is_empty $UNIFYFS_MP + ' + + # Cleanup posix mountpoint + test_expect_success POSIX \ + "Cleanup UNIFYFS_CI_POSIX_MP: $UNIFYFS_CI_POSIX_MP" ' + rm -rf $UNIFYFS_CI_POSIX_MP/*posix* + ' + + # cleanup_hosts + test_expect_success PDSH,CLEAN "Cleanup hosts" ' + cleanup_hosts + ' + + # Remove trap + # If any tests failed, the suite will exit with 1 which + # will trigger the trap. Since the hosts were already + # cleaned at this point, can remove trap to prevent + # cleanup_hosts from being called again. + trap - EXIT + + # end here if running tests individually + if [[ -z $UNIFYFS_CI_TESTS_FULL_RUN ]]; then + ( test_done; ) + test_exit_code=$? 
+ + cd "$(dirname "$SHARNESS_TRASH_DIRECTORY")" - cd "$(dirname "$SHARNESS_TRASH_DIRECTORY")" + return $test_exit_code + fi - return $test_exit_code -fi +fi # if [ $DONE_TESTING -eq "1" ]]; then diff --git a/t/ci/RUN_CI_STAGE_TESTS.sh b/t/ci/RUN_CI_STAGE_TESTS.sh new file mode 100755 index 000000000..6760a6710 --- /dev/null +++ b/t/ci/RUN_CI_STAGE_TESTS.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +# This script runs a full sweep of tests using the +# stage application to bring data into the UnifyFS file +# space and then back out. + +# This script runs 002-start-server.sh several times, while +# typically changing the server configuration for each block +# of runs. + +# Each test (generating a random file, copying into unifyfs, +# copying it back out, and then checking against the original) +# is done by 800-stage-tests.sh. This script sets up the +# server configurations (outer loop), iterates over file size +# (inner loop), running the 800- script each time. +# Then the final results are saved to a permanent directory +# and we exit the job. + +test_description="Overall Multiconfiguration UnifyFS Stage tests" + +SECONDS=0 +start_time=$SECONDS +echo "Started RUN_CI_STAGE_TETS.sh @: $(date)" + +# Set up UNIFYFS_CI_DIR if this script is called first +UNIFYFS_CI_DIR=${UNIFYFS_CI_DIR:-"$(dirname "$(readlink -fm $BASH_SOURCE)")"} + +# test_done gets called in 990-stop-server.sh if this is not set. +# If not set, tests can be run individually +UNIFYFS_CI_TESTS_FULL_RUN=true + +# If the user hasn't specified an overall test label, +# here's the boilerplate one. +if [ -z "$STAGE_TEST_OVERALL_CONFIG" ] ; then + STAGE_TEST_OVERALL_CONFIG="UnifyFS CI Generic Stage Test" +fi + +# Setup testing +source $UNIFYFS_CI_DIR/001-setup.sh + +# Determine time setup took +setup_time=$SECONDS +echo "Setup time -- $(elapsed_time start_time setup_time)" + +# Function for running the file size sweep run once for every +# server configuration +function single_server_sweep { + source $UNIFYFS_CI_DIR/002-start-server.sh + + export STAGE_FILE_SIZE_IN_MB="100" + source $UNIFYFS_CI_DIR/800-stage-tests.sh + + export STAGE_FILE_SIZE_IN_MB="250" + source $UNIFYFS_CI_DIR/800-stage-tests.sh + + export STAGE_FILE_SIZE_IN_MB="1000" + source $UNIFYFS_CI_DIR/800-stage-tests.sh + + export STAGE_FILE_SIZE_IN_MB="2000" + source $UNIFYFS_CI_DIR/800-stage-tests.sh +} + +# First configuration +export STAGE_TEST_SPECIFIC_CONFIG="8GB_spill" +export UNIFYFS_LOGIO_SPILL_SIZE=$((8 * $GB)) +single_server_sweep + +# Stop server with --allow-restart option if not final time stopping +source $UNIFYFS_CI_DIR/990-stop-server.sh --allow-restart + +# Second server configuration +export STAGE_TEST_SPECIFIC_CONFIG="16GB_spill" +export UNIFYFS_LOGIO_SPILL_SIZE=$((16 * $GB)) +single_server_sweep + +# Other server configurations would go here, each one calling +# single_server_sweep() after setting environment variables appropriately +# Ensure that only the final configuration doesn't provide the +# --allow-restart option when stopping the servers + +# Stop unifyfsd and cleanup +source $UNIFYFS_CI_DIR/990-stop-server.sh + +# Determine time testing took +testing_time=$SECONDS +echo "Testing time -- $(elapsed_time setup_time testing_time)" + +# Save off the timing files in a location that won't be deleted +mkdir -p ${UNIFYFS_CI_DIR}/stage_serial_test_timings \ + ${UNIFYFS_CI_DIR}/stage_parallel_test_timings +cp ${STAGE_LOG_DIR}/timings_serial_${JOB_ID}.dat \ + ${UNIFYFS_CI_DIR}/stage_serial_test_timings +cp ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat \ + 
${UNIFYFS_CI_DIR}/stage_parallel_test_timings + +end_time=$SECONDS +echo "All done @ $(date)" +echo "Total run time -- $(elapsed_time start_time end_time)" + +test_done +exit 0 diff --git a/t/ci/RUN_CI_TESTS.sh b/t/ci/RUN_CI_TESTS.sh index 0b3194050..0dae3e5a7 100755 --- a/t/ci/RUN_CI_TESTS.sh +++ b/t/ci/RUN_CI_TESTS.sh @@ -76,14 +76,14 @@ done SECONDS=0 start_time=$SECONDS -echo "Started RUN_TESTS.sh @: $(date)" +echo "Started RUN_CI_TESTS.sh @: $(date)" # Set up UNIFYFS_CI_DIR if this script is called first UNIFYFS_CI_DIR=${UNIFYFS_CI_DIR:-"$(dirname "$(readlink -fm $BASH_SOURCE)")"} # test_done gets called in 990-stop-server.sh if this is not set. # If not set, tests can be run individually -full_run=true +UNIFYFS_CI_TESTS_FULL_RUN=true # setup testing source $UNIFYFS_CI_DIR/001-setup.sh @@ -129,6 +129,10 @@ source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh # MPI-IO producer-consumer tests source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --mpiio +### unifyfs-stage tests + +source $UNIFYFS_CI_DIR/800-stage-tests.sh + ############################################################################## # DO NOT add additional tests after this point ############################################################################## From 1c8293cfc2575708ec6ec404fc3fd468a5abd7f1 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 28 Oct 2021 13:42:55 -0400 Subject: [PATCH 51/81] Define _STAT_VER if it's not already defined As of glibc v2.33, _STAT_VER is no longer defined. This commit adds code to #define it to 3 if it's not already defined. Fixes issue #687 --- client/src/unifyfs-sysio.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index bd28177cd..dd0826909 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -814,6 +814,16 @@ int UNIFYFS_WRAP(fstat)(int fd, struct stat* buf) * instead of using the absolute value 3. */ +/* + * NOTE 2: As of glibc-2.33, _STAT_VER is no longer defined in bits/stat.h. + * __xstat is also no longer declared in stat.h, but it still exists in the + * library, so HAVE___XSTAT will be true. (The same goes for __lxstat & + * __fxstat.) In such a case, we have to define _STAT_VER ourselves. + */ +#ifndef _STAT_VER + #define _STAT_VER 3 +#endif + #ifdef HAVE___XSTAT int UNIFYFS_WRAP(__xstat)(int vers, const char* path, struct stat* buf) { From 13d00249f35effa4f1ded7545f910deab87fba10 Mon Sep 17 00:00:00 2001 From: Kathryn Mohror Date: Thu, 28 Oct 2021 05:53:50 -0700 Subject: [PATCH 52/81] update CRUISE license For some reason the CRUISE license was actually the BurstFS license. Updated the file with the correct license text from the CRUISE repo: https://github.com/LLNL/cruise/blob/master/COPYRIGHT --- LICENSE.CRUISE | 91 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/LICENSE.CRUISE b/LICENSE.CRUISE index be7c25f7d..9190c364f 100644 --- a/LICENSE.CRUISE +++ b/LICENSE.CRUISE @@ -2,36 +2,67 @@ Copyright and BSD License --------------------- -Copyright (c) 2017, Lawrence Livermore National Security, LLC. -Produced at the Lawrence Livermore National Laboratory. +Copyright (c) 2014, Lawrence Livermore National Security, LLC. +Produced at the Lawrence Livermore National Laboratory +Written by + Raghunath Rajachandrasekar + Kathryn Mohror + Adam Moody +LLNL-CODE-642432. +All rights reserved. +This file is part of CRUISE. +For details, see https://github.com/hpc/cruise -Copyright (c) 2017, Florida State University. 
-Contributions from the Computer Architecture and Systems Research Laboratory (CASTL) -at the Department of Computer Science. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: -Written by - Teng Wang tw15g@my.fsu.edu - Adam Moody moody20@llnl.gov - Weikuan Yu wyu3@fsu.edu - Kento Sato kento@llnl.gov - Kathryn Mohror kathryn@llnl.gov -LLNL-CODE-728877. + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the disclaimer below. -All rights reserved. -This file is part of CRUISE. For details, see https://github.com/llnl/cruise. -Permission is hereby granted, free of charge, to any person obtaining a copy of this -software and associated documentation files (the "Software"), -to deal in the Software without restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and -to permit persons to whom the Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial -portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE -OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the disclaimer (as noted + below) in the documentation and/or other materials provided with + the distribution. + + - Neither the name of the LLNS/LLNL nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE +LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--------------------- +Additional BSD Notice +--------------------- + +1. This notice is required to be provided under our contract with the + U.S. Department of Energy (DOE). This work was produced at Lawrence + Livermore National Laboratory under Contract No. DE-AC52-07NA27344 + with the DOE. + +2. 
Neither the United States Government nor Lawrence Livermore + National Security, LLC nor any of their employees, makes any + warranty, express or implied, or assumes any liability or + responsibility for the accuracy, completeness, or usefulness of + any information, apparatus, product, or process disclosed, or + represents that its use would not infringe privately-owned rights. + +3. Also, reference herein to any specific commercial products, process, + or services by trade name, trademark, manufacturer or otherwise does + not necessarily constitute or imply its endorsement, recommendation, + or favoring by the United States Government or Lawrence Livermore + National Security, LLC. The views and opinions of authors expressed + herein do not necessarily state or reflect those of the United States + Government or Lawrence Livermore National Security, LLC, and shall + not be used for advertising or product endorsement purposes. From f12ade6aba0fd693c5868afc9e0994888a8189f5 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Fri, 22 Oct 2021 14:23:43 -0400 Subject: [PATCH 53/81] add server.local_extents mode Also: * add '-l|--laminate' as config option to examples testutil, and update write examples accordingly * add utility clients for library api-based laminate and remove TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- common/src/unifyfs_configurator.h | 3 +- configure.ac | 2 + examples/src/multi-write.c | 13 ++-- examples/src/read.c | 6 +- examples/src/testutil.h | 11 +++- examples/src/write-transfer.c | 14 +++-- examples/src/write.c | 16 ++--- examples/src/writeread.c | 36 +++++------ server/src/extent_tree.c | 33 +++++++++- server/src/extent_tree.h | 3 +- server/src/unifyfs_global.h | 9 ++- server/src/unifyfs_inode.c | 27 ++++++-- server/src/unifyfs_inode.h | 12 ++-- server/src/unifyfs_p2p_rpc.c | 38 ++++++++---- server/src/unifyfs_server.c | 10 +++ server/src/unifyfs_service_manager.c | 37 +++++------ server/src/unifyfs_service_manager.h | 3 +- util/Makefile.am | 2 +- util/unifyfs-api-client/Makefile.am | 1 + util/unifyfs-api-client/src/Makefile.am | 29 +++++++++ .../unifyfs-api-client/src/unifyfs-laminate.c | 61 +++++++++++++++++++ util/unifyfs-api-client/src/unifyfs-remove.c | 61 +++++++++++++++++++ 22 files changed, 337 insertions(+), 90 deletions(-) create mode 100644 util/unifyfs-api-client/Makefile.am create mode 100644 util/unifyfs-api-client/src/Makefile.am create mode 100644 util/unifyfs-api-client/src/unifyfs-laminate.c create mode 100644 util/unifyfs-api-client/src/unifyfs-remove.c diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index e76accc3c..3e50aedad 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -71,7 +71,7 @@ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ UNIFYFS_CFG(client, fsync_persist, BOOL, on, "persist written data to storage on fsync()", NULL) \ - UNIFYFS_CFG(client, local_extents, BOOL, off, "track extents to service reads of local data", NULL) \ + UNIFYFS_CFG(client, local_extents, BOOL, off, "cache extents within client to service local reads without consulting local server", NULL) \ UNIFYFS_CFG(client, max_files, INT, UNIFYFS_CLIENT_MAX_FILES, "client max file count", NULL) \ UNIFYFS_CFG(client, write_index_size, INT, UNIFYFS_CLIENT_WRITE_INDEX_SIZE, "write metadata index buffer size", NULL) 
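As a usage sketch for the new extent-caching knobs (the environment variable names assume the usual UNIFYFS_<SECTION>_<KEY> mapping used by the CI scripts elsewhere in this series, e.g. UNIFYFS_SERVER_MAX_APP_CLIENTS; values shown are illustrative):

    # allow a server to answer local reads from its own cached extents
    # without consulting the file's owner server
    export UNIFYFS_SERVER_LOCAL_EXTENTS=on
    # the analogous client-side extent cache remains off by default
    export UNIFYFS_CLIENT_LOCAL_EXTENTS=off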
\ UNIFYFS_CFG(client, write_sync, BOOL, off, "sync every write to server", NULL) \ @@ -90,6 +90,7 @@ UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", configurator_directory_check, 'R', "specify full path to directory to contain server-local state") \ UNIFYFS_CFG_CLI(server, hostfile, STRING, NULLSTRING, "server hostfile name", NULL, 'H', "specify full path to server hostfile") \ UNIFYFS_CFG_CLI(server, init_timeout, INT, UNIFYFS_DEFAULT_INIT_TIMEOUT, "timeout of waiting for server initialization", NULL, 't', "timeout in seconds to wait for servers to be ready for clients") \ + UNIFYFS_CFG(server, local_extents, BOOL, off, "use server extents to service local reads without consulting file owner", NULL) \ UNIFYFS_CFG(server, max_app_clients, INT, UNIFYFS_SERVER_MAX_APP_CLIENTS, "maximum number of clients per application", NULL) \ UNIFYFS_CFG_CLI(sharedfs, dir, STRING, NULLSTRING, "shared file system directory", configurator_directory_check, 'S', "specify full path to directory to contain server shared files") \ diff --git a/configure.ac b/configure.ac index 84db84643..f45c8eecd 100755 --- a/configure.ac +++ b/configure.ac @@ -402,6 +402,8 @@ AC_CONFIG_FILES([Makefile util/scripts/lsfcsm/Makefile util/unifyfs/Makefile util/unifyfs/src/Makefile + util/unifyfs-api-client/Makefile + util/unifyfs-api-client/src/Makefile util/unifyfs-stage/Makefile util/unifyfs-stage/src/Makefile]) diff --git a/examples/src/multi-write.c b/examples/src/multi-write.c index 369f82499..b7064da7a 100644 --- a/examples/src/multi-write.c +++ b/examples/src/multi-write.c @@ -61,7 +61,7 @@ int check_file(char* file) int fd; int rc; int matched = 0; - fd = open(file, O_RDONLY, 0222); + fd = open(file, O_RDONLY); memset(tmpbuf, 0, sizeof(tmpbuf)); rc = read(fd, tmpbuf, sizeof(tmpbuf)); @@ -159,12 +159,13 @@ int do_test(test_cfg* cfg) } close(fds[i]); - rc = chmod(file[i], 0444); - if (rc != 0) { - printf("%s failed to chmod, rc = %d\n", file[i], rc); - exit(1); + if (cfg->laminate) { + rc = chmod(file[i], 0444); + if (rc != 0) { + printf("%s failed to chmod, rc = %d\n", file[i], rc); + exit(1); + } } - } /* Verify the writes to the files match the values in bigbuf[] */ diff --git a/examples/src/read.c b/examples/src/read.c index 8a8a1aadf..bca19f5fa 100644 --- a/examples/src/read.c +++ b/examples/src/read.c @@ -207,7 +207,7 @@ int main(int argc, char* argv[]) timer_start_barrier(cfg, &time_check); rc = check_read_req_batch(cfg, num_reqs, reqs); if (rc) { - test_abort(cfg, rc); + test_print_once(cfg, "ERROR: data check failed!"); } timer_stop_barrier(cfg, &time_check); test_print_verbose_once(cfg, "DEBUG: finished data check"); @@ -236,8 +236,8 @@ int main(int argc, char* argv[]) "I/O block size: %.2lf KiB\n" "I/O request size: %.2lf KiB\n" "Number of processes: %d\n" - "Each process wrote: %.2lf MiB\n" - "Total data written: %.2lf MiB\n" + "Each process read: %.2lf MiB\n" + "Total data read: %.2lf MiB\n" "File stat time: %.6lf sec\n" "File open time: %.6lf sec\n" "Maximum read time: %.6lf sec\n" diff --git a/examples/src/testutil.h b/examples/src/testutil.h index 3c9c81438..da0677b91 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -131,6 +131,7 @@ typedef struct { int io_pattern; /* N1 or NN */ int io_check; /* use lipsum to verify data */ int io_shuffle; /* read and write different extents */ + int laminate; /* laminate file after writing */ int pre_wr_trunc; /* truncate file before writing */ int post_wr_trunc; /* truncate file after writing */ int use_aio; /* use 
asynchronous IO */ @@ -239,6 +240,7 @@ void test_config_print(test_cfg* cfg) fprintf(fp, "\t io_pattern = %s\n", io_pattern_str(cfg->io_pattern)); fprintf(fp, "\t io_check = %d\n", cfg->io_check); fprintf(fp, "\t io_shuffle = %d\n", cfg->io_shuffle); + fprintf(fp, "\t laminate = %d\n", cfg->laminate); fprintf(fp, "\t pre_trunc = %d\n", cfg->pre_wr_trunc); fprintf(fp, "\t post_trunc = %d\n", cfg->post_wr_trunc); fprintf(fp, "\t use_aio = %d\n", cfg->use_aio); @@ -523,9 +525,10 @@ int test_is_static(const char* program) // common options for all tests -static const char* test_short_opts = "Ab:c:dD:f:hklLm:Mn:No:p:PrSt:T:uUvVx"; +static const char* test_short_opts = "aAb:c:dD:f:hklLm:Mn:No:p:PrSt:T:uUvVx"; static const struct option test_long_opts[] = { + { "library-api", 0, 0, 'a' }, { "aio", 0, 0, 'A' }, { "blocksize", 1, 0, 'b' }, { "chunksize", 1, 0, 'c' }, @@ -534,7 +537,7 @@ static const struct option test_long_opts[] = { { "file", 1, 0, 'f' }, { "help", 0, 0, 'h' }, { "check", 0, 0, 'k' }, - { "library-api", 0, 0, 'l' }, + { "laminate", 0, 0, 'l' }, { "listio", 0, 0, 'L' }, { "mount", 1, 0, 'm' }, { "mpiio", 0, 0, 'M' }, @@ -560,6 +563,8 @@ static const char* test_usage_str = "Usage: %s [options...]\n" "\n" "Available options:\n" + " -a, --library-api use UnifyFS library API instead of POSIX I/O\n" + " (default: off)\n" " -A, --aio use asynchronous I/O instead of read|write\n" " (default: off)\n" " -b, --blocksize= I/O block size\n" @@ -574,7 +579,7 @@ static const char* test_usage_str = " (default: 'testfile')\n" " -k, --check check data contents upon read\n" " (default: off)\n" - " -l, --library-api use UnifyFS library API instead of POSIX I/O\n" + " -l, --laminate laminate file after writing all data\n" " (default: off)\n" " -L, --listio use lio_listio instead of read|write\n" " (default: off)\n" diff --git a/examples/src/write-transfer.c b/examples/src/write-transfer.c index 98361dfcc..4f665532f 100644 --- a/examples/src/write-transfer.c +++ b/examples/src/write-transfer.c @@ -297,14 +297,16 @@ int main(int argc, char* argv[]) "DEBUG: finished sync (elapsed=%.6lf sec)", time_sync.elapsed_sec_all); - // laminate - test_print_verbose_once(cfg, "DEBUG: laminating target file"); - rc = write_laminate(cfg, target_file); - if (rc) { - test_abort(cfg, rc); + if (cfg->laminate) { + // laminate + test_print_verbose_once(cfg, "DEBUG: laminating target file"); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } } - // stat file post-laminate + // stat file test_print_verbose_once(cfg, "DEBUG: calling stat() on target file"); stat_cmd(cfg, target_file); diff --git a/examples/src/write.c b/examples/src/write.c index cceaba3bb..39f1bd2d3 100644 --- a/examples/src/write.c +++ b/examples/src/write.c @@ -221,14 +221,16 @@ int main(int argc, char* argv[]) free(reqs); reqs = NULL; - // laminate - timer_start_barrier(cfg, &time_laminate); - rc = write_laminate(cfg, target_file); - if (rc) { - test_abort(cfg, rc); + if (cfg->laminate) { + // laminate + timer_start_barrier(cfg, &time_laminate); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_laminate); + test_print_verbose_once(cfg, "DEBUG: finished laminate"); } - timer_stop_barrier(cfg, &time_laminate); - test_print_verbose_once(cfg, "DEBUG: finished laminate"); // timer to wrap all parts of write operation timer_stop_barrier(cfg, &time_create2laminate); diff --git a/examples/src/writeread.c b/examples/src/writeread.c index 15c57ea00..890ea6412 100644 --- 
a/examples/src/writeread.c +++ b/examples/src/writeread.c @@ -308,24 +308,26 @@ int main(int argc, char* argv[]) time_stat_pre2.elapsed_sec_all); } - // laminate - timer_start_barrier(cfg, &time_laminate); - rc = write_laminate(cfg, target_file); - if (rc) { - test_abort(cfg, rc); - } - timer_stop_barrier(cfg, &time_laminate); - test_print_verbose_once(cfg, - "DEBUG: finished laminate (elapsed=%.6lf sec)", - time_laminate.elapsed_sec_all); + if (cfg->laminate) { + // laminate + timer_start_barrier(cfg, &time_laminate); + rc = write_laminate(cfg, target_file); + if (rc) { + test_abort(cfg, rc); + } + timer_stop_barrier(cfg, &time_laminate); + test_print_verbose_once(cfg, + "DEBUG: finished laminate (elapsed=%.6lf sec)", + time_laminate.elapsed_sec_all); - // stat file post-laminate - timer_start_barrier(cfg, &time_stat_post); - stat_cmd(cfg, target_file); - timer_stop_barrier(cfg, &time_stat_post); - test_print_verbose_once(cfg, - "DEBUG: finished stat post-laminate (elapsed=%.6lf sec)", - time_stat_post.elapsed_sec_all); + // stat file post-laminate + timer_start_barrier(cfg, &time_stat_post); + stat_cmd(cfg, target_file); + timer_stop_barrier(cfg, &time_stat_post); + test_print_verbose_once(cfg, + "DEBUG: finished stat post-laminate (elapsed=%.6lf sec)", + time_stat_post.elapsed_sec_all); + } // post-write cleanup free(wr_buf); diff --git a/server/src/extent_tree.c b/server/src/extent_tree.c index f17a6c9ad..4ff4aead8 100644 --- a/server/src/extent_tree.c +++ b/server/src/extent_tree.c @@ -646,16 +646,22 @@ int extent_tree_get_chunk_list( struct extent_tree* extent_tree, /* extent tree to search */ unsigned long offset, /* starting logical offset */ unsigned long len, /* length of extent */ - unsigned int* n_chunks, /* [out] number of extents returned */ - chunk_read_req_t** chunks) /* [out] extent array */ + unsigned int* n_chunks, /* [out] number of chunks returned */ + chunk_read_req_t** chunks, /* [out] chunk array */ + int* extent_covered) /* [out] set=1 if extent fully covered */ { int ret = 0; unsigned int count = 0; unsigned long end = offset + len - 1; struct extent_tree_node* first = NULL; + struct extent_tree_node* last = NULL; struct extent_tree_node* next = NULL; chunk_read_req_t* out_chunks = NULL; chunk_read_req_t* current = NULL; + unsigned long prev_end = 0; + bool gap_found = false; + + *extent_covered = 0; extent_tree_rdlock(extent_tree); @@ -663,12 +669,31 @@ int extent_tree_get_chunk_list( next = first; while (next && next->start <= end) { count++; + + if (!gap_found) { + unsigned long curr_start = next->start; + if (next != first) { + /* check for a gap between current and previous extent */ + if ((prev_end + 1) != curr_start) { + gap_found = true; + } + } + prev_end = next->end; + } + + /* iterate to next extent */ + last = next; next = extent_tree_iter(extent_tree, next); } *n_chunks = count; if (0 == count) { + gap_found = true; goto out_unlock; + } else { + if ((first->start > offset) || (last->end < end)) { + gap_found = true; + } } out_chunks = calloc(count, sizeof(*out_chunks)); @@ -693,6 +718,10 @@ int extent_tree_get_chunk_list( out_unlock: extent_tree_unlock(extent_tree); + if (!gap_found) { + *extent_covered = 1; + } + return ret; } diff --git a/server/src/extent_tree.h b/server/src/extent_tree.h index ef7d55369..ac53ba61e 100644 --- a/server/src/extent_tree.h +++ b/server/src/extent_tree.h @@ -168,7 +168,8 @@ int extent_tree_get_chunk_list( unsigned long offset, /* starting logical offset */ unsigned long len, /* length of extent */ unsigned int* 
n_chunks, /* [out] number of chunks returned */ - chunk_read_req_t** chunks); /* [out] extent array */ + chunk_read_req_t** chunks, /* [out] chunk array */ + int* extent_covered); /* [out] set=1 if extent fully covered */ /* dump method for debugging extent trees */ static inline diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index be1fa6214..9bb4d9c83 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -74,9 +74,14 @@ typedef struct { int pmi_rank; } server_info_t; -extern size_t glb_num_servers; /* number of entries in glb_servers array */ +/* number of entries in glb_servers array */ +extern size_t glb_num_servers; -extern struct unifyfs_inode_tree* global_inode_tree; /* global inode tree */ +/* global inode tree */ +extern struct unifyfs_inode_tree* global_inode_tree; + +/* flag to control the use of server local extents for faster local reads */ +extern bool use_server_local_extents; // NEW READ REQUEST STRUCTURES typedef enum { diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index 4397d847c..0596a0c55 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -511,11 +511,15 @@ int unifyfs_inode_get_extents(int gfid, size_t* n, int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, unsigned int* n_chunks, - chunk_read_req_t** chunks) + chunk_read_req_t** chunks, + int* full_coverage) { int ret = UNIFYFS_SUCCESS; struct unifyfs_inode* ino = NULL; int gfid = extent->gfid; + int covered = 0; + + *full_coverage = 0; unifyfs_inode_tree_rdlock(global_inode_tree); { @@ -529,7 +533,8 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, unsigned long offset = extent->offset; unsigned long len = extent->length; ret = extent_tree_get_chunk_list(ino->extents, offset, len, - n_chunks, chunks); + n_chunks, chunks, + &covered); if (ret) { LOGERR("failed to get chunks for gfid:%d, ret=%d", gfid, ret); @@ -546,6 +551,7 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, for (unsigned int i = 0; i < *n_chunks; i++) { (*chunks)[i].gfid = gfid; } + *full_coverage = covered; } else { *n_chunks = 0; *chunks = NULL; @@ -578,9 +584,11 @@ int compare_chunk_read_reqs(const void* _c1, const void* _c2) int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, unifyfs_inode_extent_t* extents, unsigned int* n_locs, - chunk_read_req_t** chunklocs) + chunk_read_req_t** chunklocs, + int* full_coverage) { int ret = UNIFYFS_SUCCESS; + int fully_covered = 1; unsigned int i = 0; unsigned int j = 0; unsigned int n_chunks = 0; @@ -588,6 +596,11 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, unsigned int* n_resolved = NULL; chunk_read_req_t** resolved = NULL; + /* set default output parameter values */ + *n_locs = 0; + *chunklocs = NULL; + *full_coverage = 0; + void* buf = calloc(n_extents, (sizeof(*n_resolved) + sizeof(*resolved))); if (NULL == buf) { LOGERR("failed to allocate memory"); @@ -605,14 +618,19 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, LOGDBG("resolving extent request [gfid=%d, offset=%lu, length=%lu]", current->gfid, current->offset, current->length); + int covered = 0; ret = unifyfs_inode_get_extent_chunks(current, - &n_resolved[i], &resolved[i]); + &n_resolved[i], &resolved[i], + &covered); if (ret) { LOGERR("failed to resolve extent request " "[gfid=%d, offset=%lu, length=%lu] (ret=%d)", current->gfid, current->offset, current->length, ret); goto out_fail; } + if (!covered) { + fully_covered = 0; + } n_chunks += 
n_resolved[i]; } @@ -652,6 +670,7 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, *n_locs = n_chunks; *chunklocs = chunks; + *full_coverage = fully_covered; out_fail: if (ret != UNIFYFS_SUCCESS) { diff --git a/server/src/unifyfs_inode.h b/server/src/unifyfs_inode.h index 0ee7c85f9..c52c0f0a8 100644 --- a/server/src/unifyfs_inode.h +++ b/server/src/unifyfs_inode.h @@ -170,12 +170,14 @@ int unifyfs_inode_laminate(int gfid); * * @param[out] n_chunks number of output chunk locations * @param[out] chunks array of output chunk locations + * @param[out] full_coverage set to 1 if chunks fully cover extent * * @return UNIFYFS_SUCCESS, or error code */ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, unsigned int* n_chunks, - chunk_read_req_t** chunks); + chunk_read_req_t** chunks, + int* full_coverage); /** * @brief Get chunk locations for an array of file extents @@ -183,15 +185,17 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, * @param n_extents number of input extents * @param extents array or requested extents * - * @param[out] n_locs number of output chunk locations - * @param[out] chunklocs array of output chunk locations + * @param[out] n_locs number of output chunk locations + * @param[out] chunklocs array of output chunk locations + * @param[out] full_coverage set to 1 if chunks fully cover extents * * @return UNIFYFS_SUCCESS, or error code */ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, unifyfs_inode_extent_t* extents, unsigned int* n_locs, - chunk_read_req_t** chunklocs); + chunk_read_req_t** chunklocs, + int* full_coverage); /** * @brief calls extents_tree_span, which will do: diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index 94f0a2aa5..1e3172706 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -562,23 +562,40 @@ int unifyfs_invoke_find_extents_rpc(int gfid, *chunks = NULL; int owner_rank = hash_gfid_to_server(gfid); + int is_owner = (owner_rank == glb_pmi_rank); - /* do local inode metadata lookup to check for laminated */ + /* do local inode metadata lookup */ unifyfs_file_attr_t attrs; int ret = sm_get_fileattr(gfid, &attrs); if (ret == UNIFYFS_SUCCESS) { - if ((owner_rank == glb_pmi_rank) || - (attrs.is_laminated && attrs.is_shared)) { - /* do local lookup */ + int file_laminated = (attrs.is_shared && attrs.is_laminated); + if (is_owner || use_server_local_extents || file_laminated) { + /* try local lookup */ + int full_coverage = 0; ret = sm_find_extents(gfid, (size_t)num_extents, extents, - num_chunks, chunks); + num_chunks, chunks, &full_coverage); if (ret) { LOGERR("failed to find extents for gfid=%d (ret=%d)", gfid, ret); - } else if (*num_chunks == 0) { - LOGDBG("extent lookup found no matching chunks"); + } else if (0 == *num_chunks) { /* found no data */ + LOGDBG("local lookup found no matching chunks"); + } else { /* found some chunks */ + if (full_coverage) { + LOGDBG("local lookup found chunks with full coverage"); + } else { + LOGDBG("local lookup found chunks with partial coverage"); + } + } + if (is_owner || file_laminated || full_coverage) { + return ret; + } + /* else, fall through to owner lookup */ + if (*num_chunks > 0) { + /* release local results */ + *num_chunks = 0; + free(*chunks); + *chunks = NULL; } - return ret; } } @@ -633,11 +650,10 @@ int unifyfs_invoke_find_extents_rpc(int gfid, /* get number of chunks */ unsigned int n_chks = (unsigned int) out.num_locations; if (n_chks > 0) { - /* got some chunks to read, get 
bulk buffer - * holding chunk location data */ + /* get bulk buffer with chunk locations */ buf_sz = (size_t)n_chks * sizeof(chunk_read_req_t); buf = pull_margo_bulk_buffer(preq.handle, out.locations, buf_sz, - NULL); + NULL); if (NULL == buf) { LOGERR("failed to get bulk chunk locations"); ret = UNIFYFS_ERROR_MARGO; diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 96ccf40d7..3d29630e1 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -53,6 +53,8 @@ size_t glb_num_servers; // size of glb_servers array unifyfs_cfg_t server_cfg; +bool use_server_local_extents; // = false + /* arraylist to track failed clients */ arraylist_t* failed_clients; // = NULL @@ -317,6 +319,14 @@ int main(int argc, char* argv[]) } } + if (server_cfg.server_local_extents != NULL) { + bool enable = false; + rc = configurator_bool_val(server_cfg.server_local_extents, &enable); + if ((0 == rc) && enable) { + use_server_local_extents = true; + } + } + // setup clean termination by signal memset(&sa, 0, sizeof(struct sigaction)); sa.sa_handler = exit_request; diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index 96e4fce6c..4e319f014 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -522,30 +522,24 @@ int sm_find_extents(int gfid, size_t num_extents, unifyfs_inode_extent_t* extents, unsigned int* out_num_chunks, - chunk_read_req_t** out_chunks) + chunk_read_req_t** out_chunks, + int* full_coverage) { - int owner_rank = hash_gfid_to_server(gfid); - int is_owner = (owner_rank == glb_pmi_rank); - - /* do local inode metadata lookup to check for laminated */ + /* do local inode metadata lookup */ unifyfs_file_attr_t attrs; int ret = unifyfs_inode_metaget(gfid, &attrs); if (ret == UNIFYFS_SUCCESS) { - /* do local lookup */ - if (is_owner || attrs.is_laminated) { - unsigned int n_extents = (unsigned int)num_extents; - ret = unifyfs_inode_resolve_extent_chunks(n_extents, extents, - out_num_chunks, - out_chunks); - if (ret) { - LOGERR("failed to find extents for gfid=%d (rc=%d)", - gfid, ret); - } else if (*out_num_chunks == 0) { - LOGDBG("extent lookup found no matching chunks"); - } - } else { - LOGWARN("cannot find extents for unlaminated file at non-owner"); - ret = UNIFYFS_FAILURE; + /* do inode extent lookup */ + unsigned int n_extents = (unsigned int)num_extents; + ret = unifyfs_inode_resolve_extent_chunks(n_extents, extents, + out_num_chunks, + out_chunks, + full_coverage); + if (ret) { + LOGERR("failed to find extents for gfid=%d (rc=%d)", + gfid, ret); + } else if (*out_num_chunks == 0) { + LOGDBG("extent lookup for gfid=%d found no matching chunks", gfid); } } return ret; @@ -860,10 +854,11 @@ static int process_find_extents_rpc(server_rpc_req_t* req) num_extents, gfid, sender); /* find chunks for given extents */ + int full_coverage = 0; unsigned int num_chunks = 0; chunk_read_req_t* chunk_locs = NULL; int ret = sm_find_extents(gfid, num_extents, extents, - &num_chunks, &chunk_locs); + &num_chunks, &chunk_locs, &full_coverage); margo_free_input(req->handle, in); free(in); diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index 004c59353..f15e3ece0 100644 --- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -86,7 +86,8 @@ int sm_find_extents(int gfid, size_t num_extents, unifyfs_inode_extent_t* extents, unsigned int* out_num_chunks, - chunk_read_req_t** out_chunks); + chunk_read_req_t** out_chunks, + int* 
full_coverage); int sm_transfer(int client_server, int client_app, diff --git a/util/Makefile.am b/util/Makefile.am index 69e40c9c1..a86ea3f8f 100644 --- a/util/Makefile.am +++ b/util/Makefile.am @@ -1 +1 @@ -SUBDIRS = scripts unifyfs unifyfs-stage +SUBDIRS = scripts unifyfs unifyfs-api-client unifyfs-stage diff --git a/util/unifyfs-api-client/Makefile.am b/util/unifyfs-api-client/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/util/unifyfs-api-client/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/util/unifyfs-api-client/src/Makefile.am b/util/unifyfs-api-client/src/Makefile.am new file mode 100644 index 000000000..5acc4254c --- /dev/null +++ b/util/unifyfs-api-client/src/Makefile.am @@ -0,0 +1,29 @@ +libexec_PROGRAMS = \ + unifyfs-laminate \ + unifyfs-remove + +CLEANFILES = $(libexec_PROGRAMS) + +# Compiler/linker flags + +AM_CFLAGS = -Wall -Werror + +api_client_cppflags = \ + $(AM_CPPFLAGS) \ + -I$(top_srcdir)/client/src \ + -I$(top_srcdir)/common/src + +api_client_ldadd = $(top_builddir)/client/src/libunifyfs_api.la +api_client_ldflags = $(AM_LDFLAGS) -static + +# Per-target flags begin here + +unifyfs_laminate_CPPFLAGS = $(api_client_cppflags) +unifyfs_laminate_LDADD = $(api_client_ldadd) +unifyfs_laminate_LDFLAGS = $(api_client_ldflags) +unifyfs_laminate_SOURCES = unifyfs-laminate.c + +unifyfs_remove_CPPFLAGS = $(api_client_cppflags) +unifyfs_remove_LDADD = $(api_client_ldadd) +unifyfs_remove_LDFLAGS = $(api_client_ldflags) +unifyfs_remove_SOURCES = unifyfs-remove.c \ No newline at end of file diff --git a/util/unifyfs-api-client/src/unifyfs-laminate.c b/util/unifyfs-api-client/src/unifyfs-laminate.c new file mode 100644 index 000000000..28c14d32e --- /dev/null +++ b/util/unifyfs-api-client/src/unifyfs-laminate.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
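 *
 * Illustrative usage (launcher and install prefix are site-specific):
 *   <prefix>/libexec/unifyfs-laminate <mountpoint> <file> [<file> ...]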
+ */ + +#include +#include "unifyfs_api.h" + +static void usage(char* arg0) +{ + fprintf(stderr, "USAGE: %s [ ...]\n", + arg0); + fflush(stderr); +} + +int main(int argc, char** argv) +{ + if (argc < 3) { + fprintf(stderr, "USAGE ERROR: expected two or more arguments!\n"); + usage(argv[0]); + return -1; + } + + char* mountpt = argv[1]; + + unifyfs_handle fshdl; + unifyfs_rc urc = unifyfs_initialize(mountpt, NULL, 0, &fshdl); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: init failed at mountpoint %s - %s\n", + mountpt, unifyfs_rc_enum_description(urc)); + return 1; + } + + for (int i = 2; i < argc; i++) { + char* filepath = argv[i]; + urc = unifyfs_laminate(fshdl, filepath); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: failed to laminate file %s - %s\n", + filepath, unifyfs_rc_enum_description(urc)); + return 2; + } + } + + urc = unifyfs_finalize(fshdl); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: failed to finalize - %s\n", + unifyfs_rc_enum_description(urc)); + return 3; + } + + return 0; +} diff --git a/util/unifyfs-api-client/src/unifyfs-remove.c b/util/unifyfs-api-client/src/unifyfs-remove.c new file mode 100644 index 000000000..2427b621f --- /dev/null +++ b/util/unifyfs-api-client/src/unifyfs-remove.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021, Lawrence Livermore National Security, LLC. + * Produced at the Lawrence Livermore National Laboratory. + * + * Copyright 2021, UT-Battelle, LLC. + * + * LLNL-CODE-741539 + * All rights reserved. + * + * This is the license for UnifyFS. + * For details, see https://github.com/LLNL/UnifyFS. + * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. + */ + +#include +#include "unifyfs_api.h" + +static void usage(char* arg0) +{ + fprintf(stderr, "USAGE: %s [ ...]\n", + arg0); + fflush(stderr); +} + +int main(int argc, char** argv) +{ + if (argc < 3) { + fprintf(stderr, "USAGE ERROR: expected two arguments!\n"); + usage(argv[0]); + return -1; + } + + char* mountpt = argv[1]; + + unifyfs_handle fshdl; + unifyfs_rc urc = unifyfs_initialize(mountpt, NULL, 0, &fshdl); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: init failed at mountpoint %s - %s\n", + mountpt, unifyfs_rc_enum_description(urc)); + return 1; + } + + for (int i = 2; i < argc; i++) { + char* filepath = argv[i]; + urc = unifyfs_remove(fshdl, filepath); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: failed to remove file %s - %s\n", + filepath, unifyfs_rc_enum_description(urc)); + return 2; + } + } + + urc = unifyfs_finalize(fshdl); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS ERROR: failed to finalize - %s\n", + unifyfs_rc_enum_description(urc)); + return 3; + } + + return 0; +} From 2d816a9bdd2373d4c3bddc94945adac4cf9984eb Mon Sep 17 00:00:00 2001 From: Rob Latham Date: Tue, 30 Nov 2021 18:15:56 -0600 Subject: [PATCH 54/81] use flexible arrays newer gcc-11 will complain about overrunning arrays if we don't use the C99 standard "flexible array" notation --- common/src/tinyexpr.c | 2 +- common/src/tinyexpr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/tinyexpr.c b/common/src/tinyexpr.c index 90ed8fce5..0ab137a59 100644 --- a/common/src/tinyexpr.c +++ b/common/src/tinyexpr.c @@ -84,7 +84,7 @@ typedef struct state { static te_expr *new_expr(const int type, const te_expr *parameters[]) { const int arity = ARITY(type); const int psize = sizeof(void*) * arity; - const int size = (sizeof(te_expr) - sizeof(void*)) + psize + 
(IS_CLOSURE(type) ? sizeof(void*) : 0); + const int size = sizeof(te_expr) + psize + (IS_CLOSURE(type) ? sizeof(void*) : 0); te_expr *ret = malloc(size); memset(ret, 0, size); if (arity && parameters) { diff --git a/common/src/tinyexpr.h b/common/src/tinyexpr.h index 827863362..5889e16bf 100644 --- a/common/src/tinyexpr.h +++ b/common/src/tinyexpr.h @@ -35,7 +35,7 @@ extern "C" { typedef struct te_expr { int type; union {double value; const double *bound; const void *function;}; - void *parameters[1]; + void *parameters[]; } te_expr; From b70f8a6a39cebd0a88092814cea0c257d1b420cd Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Fri, 10 Dec 2021 14:01:05 -0500 Subject: [PATCH 55/81] fix bugs causing issue 698 --- examples/src/testutil.h | 14 +++++++++----- examples/src/testutil_rdwr.h | 6 +++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/examples/src/testutil.h b/examples/src/testutil.h index da0677b91..c593b4e03 100644 --- a/examples/src/testutil.h +++ b/examples/src/testutil.h @@ -638,6 +638,10 @@ int test_process_argv(test_cfg* cfg, while ((ch = getopt_long(argc, argv, test_short_opts, test_long_opts, NULL)) != -1) { switch (ch) { + case 'a': + cfg->use_api = 1; + break; + case 'A': cfg->use_aio = 1; break; @@ -667,7 +671,7 @@ int test_process_argv(test_cfg* cfg, break; case 'l': - cfg->use_api = 1; + cfg->laminate = 1; break; case 'L': @@ -1097,11 +1101,11 @@ int test_open_file(test_cfg* cfg, const char* filepath, int access) if (cfg->use_api) { #ifndef DISABLE_UNIFYFS - unifyfs_rc rc = unifyfs_open(cfg->fshdl, access, filepath, - &(cfg->gfid)); - if (UNIFYFS_SUCCESS != rc) { + unifyfs_rc urc = unifyfs_open(cfg->fshdl, access, filepath, + &(cfg->gfid)); + if (UNIFYFS_SUCCESS != urc) { test_print(cfg, "ERROR: unifyfs_open(%s) failed - %s", - filepath, unifyfs_rc_enum_description(rc)); + filepath, unifyfs_rc_enum_description(urc)); return -1; } #endif diff --git a/examples/src/testutil_rdwr.h b/examples/src/testutil_rdwr.h index b439b57c3..cbea8c766 100644 --- a/examples/src/testutil_rdwr.h +++ b/examples/src/testutil_rdwr.h @@ -328,7 +328,7 @@ int write_sync(test_cfg* cfg) return ENOTSUP; #else unifyfs_rc urc = unifyfs_sync(cfg->fshdl, cfg->gfid); - if (UNIFYFS_SUCCESS != rc) { + if (UNIFYFS_SUCCESS != urc) { test_print(cfg, "unifyfs_sync(%s, gfid=%d) failed - %s", cfg->filename, cfg->gfid, unifyfs_rc_enum_description(urc)); @@ -372,7 +372,7 @@ int write_laminate(test_cfg* cfg, const char* filepath) return ENOTSUP; #else unifyfs_rc urc = unifyfs_laminate(cfg->fshdl, filepath); - if (UNIFYFS_SUCCESS != rc) { + if (UNIFYFS_SUCCESS != urc) { test_print(cfg, "unifyfs_laminate(%s) failed - %s", cfg->filename, unifyfs_rc_enum_description(urc)); rc = -1; @@ -405,7 +405,7 @@ int stat_file(test_cfg* cfg, const char* filepath) #else unifyfs_status us; unifyfs_rc urc = unifyfs_stat(cfg->fshdl, cfg->gfid, &us); - if (UNIFYFS_SUCCESS != rc) { + if (UNIFYFS_SUCCESS != urc) { test_print(cfg, "unifyfs_stat(%s, gfid=%d) failed - %s", cfg->filename, cfg->gfid, unifyfs_rc_enum_description(urc)); From 5b5cdb95ac0de330bd6368619027a1d8d09ba5d7 Mon Sep 17 00:00:00 2001 From: CamStan Date: Wed, 8 Dec 2021 17:06:10 -0800 Subject: [PATCH 56/81] Add laminate option to CI tests This adds an option to the CI test suites to allow for laminating when testing. A recent update changed the behavior or the example programs to no longer laminate by default. Also adds a set of writeread CI tests that shuffle the I/O when using MPI-IO. 
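For reference, an example-program run that opts back into laminating might look like the following sketch (launcher, process counts, and install path are illustrative; the flags come from the testutil usage text):

    # write, laminate (-l), then read back and verify (-k) a shared file
    srun -N2 -n2 /path/to/libexec/writeread-static -l -k -p n1 -f testfile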
Updated docs --- docs/configuration.rst | 31 ++++++++++--------- docs/examples.rst | 12 ++++++-- docs/run.rst | 8 ++--- t/ci/001-setup.sh | 4 +-- t/ci/100-writeread-tests.sh | 24 ++++++++++++--- t/ci/110-write-tests.sh | 12 +++++++- t/ci/120-read-tests.sh | 12 +++++++- t/ci/300-producer-consumer-tests.sh | 12 +++++++- t/ci/RUN_CI_TESTS.sh | 47 ++++++++++++++++++++++++++--- 9 files changed, 126 insertions(+), 36 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a815c3d56..8c083ba90 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -113,6 +113,15 @@ files. spill_dir STRING path to spillover data directory =========== ====== ============================================================ +.. table:: ``[margo]`` section - margo server NA settings + :widths: auto + + === ==== ================================================================================= + Key Type Description + === ==== ================================================================================= + tcp BOOL Use TCP for server-to-server rpcs (default: on, turn off to enable libfabric RMA) + === ==== ================================================================================= + .. table:: ``[runstate]`` section - server runstate settings :widths: auto @@ -125,21 +134,13 @@ files. .. table:: ``[server]`` section - server settings :widths: auto - ============ ====== ============================================================================= - Key Type Description - ============ ====== ============================================================================= - hostfile STRING path to server hostfile - init_timeout INT timeout in seconds to wait for servers to be ready for clients (default: 120) - ============ ====== ============================================================================= - -.. table:: ``[margo]`` section - margo server NA settings - :widths: auto - - === ==== ================================================================================= - Key Type Description - === ==== ================================================================================= - tcp BOOL Use TCP for server-to-server rpcs (default: on, turn off to enable libfabric RMA) - === ==== ================================================================================= + ============= ====== ============================================================================= + Key Type Description + ============= ====== ============================================================================= + hostfile STRING path to server hostfile + init_timeout INT timeout in seconds to wait for servers to be ready for clients (default: 120) + local_extents BOOL use server extents to service local reads without consulting file owner + ============= ====== ============================================================================= .. table:: ``[sharedfs]`` section - server shared files settings :widths: auto diff --git a/docs/examples.rst b/docs/examples.rst index 80ef83a9c..74a918b33 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -96,8 +96,8 @@ to aid in this process. Usage: write-static [options...] Available options: - -a, --appid= use given application id - (default: 0) + -a, --library-api use UnifyFS library API instead of POSIX I/O + (default: off) -A, --aio use asynchronous I/O instead of read|write (default: off) -b, --blocksize= I/O block size @@ -106,10 +106,14 @@ to aid in this process. 
(default: 1 MiB) -d, --debug for debugging, wait for input (at rank 0) at start (default: off) + -D, --destfile= transfer destination file name (or path) outside mountpoint + (default: none) -f, --file= target file name (or path) under mountpoint (default: 'testfile') -k, --check check data contents upon read (default: off) + -l, --laminate laminate file after writing all data + (default: off) -L, --listio use lio_listio instead of read|write (default: off) -m, --mount= use for unifyfs @@ -126,12 +130,16 @@ to aid in this process. (default: 'n1') -P, --prdwr use pread|pwrite instead of read|write (default: off) + -r, --reuse-filename remove and reuse the same target file name + (default: off) -S, --stdio use fread|fwrite instead of read|write (default: off) -t, --pre-truncate= truncate file to size (B) before writing (default: off) -T, --post-truncate= truncate file to size (B) after writing (default: off) + -u, --unlink unlink target file + (default: off) -U, --disable-unifyfs do not use UnifyFS (default: enable UnifyFS) -v, --verbose print verbose information diff --git a/docs/run.rst b/docs/run.rst index b4f67a4f4..0afac6973 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -200,8 +200,8 @@ The ``start``/``terminate`` transfer API stage functionality can also be used via the stand-alone application ``unifyfs-stage``. This application can be run at any time within a job to transfer new data into - or results out of UnifyFS. -A manifest file (see :ref:`above `_) needs to be provided +or results out of UnifyFS. +A manifest file (see :ref:`above `) needs to be provided as an argument to use this approach. .. code-block:: Bash @@ -290,6 +290,6 @@ Examples of using ``transfer-static``: .. code-block:: Bash :caption: Parallel Transfer - $ srun -N 4 -n 8 /path/to/libexec/transfer-static -parallel /path/on/parallelfs/file.dat /unifyfs/file.dat + $ srun -N 4 -n 8 /path/to/libexec/transfer-static --parallel /path/on/parallelfs/file.dat /unifyfs/file.dat - $ srun -N 4 -n 8 /path/to/libexec/transfer-static -parallel /unifyfs/output.dat /scratch/my_output/output.dat + $ srun -N 4 -n 8 /path/to/libexec/transfer-static --parallel /unifyfs/output.dat /scratch/my_output/output.dat diff --git a/t/ci/001-setup.sh b/t/ci/001-setup.sh index 921dd5468..f445a0033 100755 --- a/t/ci/001-setup.sh +++ b/t/ci/001-setup.sh @@ -196,10 +196,7 @@ nlt=${TMPDIR}/unifyfs.${USER}.${SYSTEM_NAME}.${JOB_ID} export UNIFYFS_CI_TEMP_DIR=${UNIFYFS_CI_TEMP_DIR:-$nlt} $JOB_RUN_ONCE_PER_NODE mkdir -p $UNIFYFS_CI_TEMP_DIR export UNIFYFS_RUNSTATE_DIR=${UNIFYFS_RUNSTATE_DIR:-$UNIFYFS_CI_TEMP_DIR} -export UNIFYFS_META_DB_PATH=${UNIFYFS_META_DB_PATH:-$UNIFYFS_CI_TEMP_DIR} echo "$infomsg UNIFYFS_RUNSTATE_DIR set as $UNIFYFS_RUNSTATE_DIR" -echo "$infomsg UNIFYFS_META_DB_PATH set as $UNIFYFS_META_DB_PATH" -echo "$infomsg Set UNIFYFS_CI_TEMP_DIR to change both of these to same path" # storage nls=$nlt @@ -208,6 +205,7 @@ export UNIFYFS_LOGIO_SPILL_DIR=${UNIFYFS_LOGIO_SPILL_DIR:-$nls} echo "$infomsg UNIFYFS_LOGIO_SPILL_SIZE set as $UNIFYFS_LOGIO_SPILL_SIZE" echo "$infomsg UNIFYFS_LOGIO_SPILL_DIR set as $UNIFYFS_LOGIO_SPILL_DIR" +export UNIFYFS_SERVER_MAX_APP_CLIENTS=${UNIFYFS_SERVER_MAX_APP_CLIENTS:-512} ########## Set up mountpoints and sharness testing prereqs ########## diff --git a/t/ci/100-writeread-tests.sh b/t/ci/100-writeread-tests.sh index 99b250a29..753950e64 100755 --- a/t/ci/100-writeread-tests.sh +++ b/t/ci/100-writeread-tests.sh @@ -63,6 +63,9 @@ do $ci_dir/001-setup.sh -h exit ;; + -l|--laminate) + writeread_laminate=yes + ;; 
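        # note: --laminate is forwarded to the example program as "-l" via the
        # $behavior string assembled further below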
-M|--mpiio) [ -n "$writeread_io_type" ] && { echo "ERROR: mutually exclusive options provided"; \ @@ -91,6 +94,12 @@ unify_test_writeread() { rc=$? lcount=$(echo "$app_output" | wc -l) + if [ -n "$writeread_laminate" ]; then + expected_lcount=29 + else + expected_lcount=17 + fi + # Test the return code and resulting line count to determine pass/fail. # If mode is posix, also test that the file or files exist at the mountpoint # depending on whether testing shared file or file-per-process. @@ -99,7 +108,7 @@ unify_test_writeread() { test_expect_success "$app_name $2: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 29 && + test $lcount = $expected_lcount && if [[ $io_pattern =~ (n1)$ ]]; then test_path_is_file ${UNIFYFS_CI_POSIX_MP}/$filename else @@ -109,7 +118,7 @@ unify_test_writeread() { else test_expect_success "$app_name $2: (line_count=${lcount}, rc=$rc)" ' test $rc = 0 && - test $lcount = 29 + test $lcount = $expected_lcount ' fi } @@ -145,16 +154,19 @@ fi # Reset additional behavior to default behavior="" +# Laminate after writing all data +if [ -n "$writeread_laminate" ]; then + behavior="$behavior -l" +fi + # Set I/O type if [ -n "$writeread_io_type" ]; then behavior="$behavior $writeread_io_type" - unset writeread_io_type # prevent option being picked up by subsequent runs fi # Read different data than written if [ -n "$writeread_shuffle" ]; then behavior="$behavior -x" - unset writeread_shuffle # prevent option being picked up by subsequent runs fi # For each io_size, test with each io_pattern and for each io_pattern, test each @@ -167,3 +179,7 @@ for io_size in "${io_sizes[@]}"; do done done done + +unset writeread_io_type +unset writeread_laminate +unset writeread_shuffle diff --git a/t/ci/110-write-tests.sh b/t/ci/110-write-tests.sh index c5b50bc7f..da946d0eb 100755 --- a/t/ci/110-write-tests.sh +++ b/t/ci/110-write-tests.sh @@ -64,6 +64,9 @@ do $ci_dir/001-setup.sh -h exit ;; + -l|--laminate) + write_laminate=yes + ;; -M|--mpiio) [ -n "$write_io_type" ] && { echo "ERROR: mutually exclusive options provided"; \ @@ -142,10 +145,14 @@ fi # Reset additional behavior to default behavior="" +# Laminate after writing all data +if [ -n "$write_laminate" ]; then + behavior="$behavior -l" +fi + # Set I/O type if [ -n "$write_io_type" ]; then behavior="$behavior $write_io_type" - unset write_io_type # prevent option being picked up by subsequent runs fi # For each io_size, test with each io_pattern and for each io_pattern, test each @@ -158,3 +165,6 @@ for io_size in "${io_sizes[@]}"; do done done done + +unset write_io_type +unset write_laminate diff --git a/t/ci/120-read-tests.sh b/t/ci/120-read-tests.sh index 3decb3bdd..b8aae08de 100755 --- a/t/ci/120-read-tests.sh +++ b/t/ci/120-read-tests.sh @@ -67,6 +67,9 @@ do $ci_dir/001-setup.sh -h exit ;; + -l|--laminate) + read_laminate=yes + ;; -M|--mpiio) [ -n "$read_io_type" ] && { echo "ERROR: mutually exclusive options provided"; \ @@ -129,10 +132,14 @@ fi # Reset additional behavior to default behavior="" +# Put "-l" in filename to ensure reading correct file +if [ -n "$read_laminate" ]; then + behavior="$behavior -l" +fi + # Set I/O type if [ -n "$read_io_type" ]; then behavior="$behavior $read_io_type" - unset read_io_type # prevent option being picked up by subsequent runs fi # For each io_size, test with each io_pattern and for each io_pattern, test each @@ -145,3 +152,6 @@ for io_size in "${io_sizes[@]}"; do done done done + +unset read_io_type +unset read_laminate diff --git 
a/t/ci/300-producer-consumer-tests.sh b/t/ci/300-producer-consumer-tests.sh index 0381f1422..a58325edf 100755 --- a/t/ci/300-producer-consumer-tests.sh +++ b/t/ci/300-producer-consumer-tests.sh @@ -69,6 +69,9 @@ do $ci_dir/001-setup.sh -h exit ;; + -l|--laminate) + producer_consumer_laminate=yes + ;; -M|--mpiio) [ -n "$write_io_type" ] && { echo "ERROR: mutually exclusive options provided"; \ @@ -148,10 +151,14 @@ fi # Reset additional behavior to default behavior="" +# Laminate after writing all data +if [ -n "$producer_consumer_laminate" ]; then + behavior="$behavior -l" +fi + # Set I/O type if [ -n "$write_io_type" ]; then behavior="$behavior $write_io_type" - unset write_io_type # prevent option being picked up by subsequent runs fi # For each io_size, test with each io_pattern and for each io_pattern, test each @@ -164,3 +171,6 @@ for io_size in "${io_sizes[@]}"; do done done done + +unset producer_consumer_laminate +unset write_io_type diff --git a/t/ci/RUN_CI_TESTS.sh b/t/ci/RUN_CI_TESTS.sh index 0dae3e5a7..477e92080 100755 --- a/t/ci/RUN_CI_TESTS.sh +++ b/t/ci/RUN_CI_TESTS.sh @@ -100,36 +100,73 @@ echo "Setup time -- $(elapsed_time start_time setup_time)" # 990-stop-server.sh) in the desired order to run them. ############################################################################## +### POSIX-IO tests ### + # POSIX-IO writeread example tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh +source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate # POSIX-IO writeread example with I/O shuffle tests +source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle + +# POSIX-IO writeread example w/out laminate tests +source $UNIFYFS_CI_DIR/100-writeread-tests.sh + +# POSIX-IO writeread example w/out laminate tests source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle # POSIX-IO write example tests -source $UNIFYFS_CI_DIR/110-write-tests.sh +source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate # POSIX-IO read example tests +source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate + +# POSIX-IO write example w/out laminate tests +source $UNIFYFS_CI_DIR/110-write-tests.sh + +# POSIX-IO read example w/out laminate tests source $UNIFYFS_CI_DIR/120-read-tests.sh +### MPI-IO tests ### + # MPI-IO writeread example tests +source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --mpiio + +# MPI-IO writeread example with I/O shuffle tests +source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle --mpiio + +# MPI-IO writeread example w/out laminate tests source $UNIFYFS_CI_DIR/100-writeread-tests.sh --mpiio +# MPI-IO writeread example w/out laminate tests +source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle --mpiio + # MPI-IO write example tests -source $UNIFYFS_CI_DIR/110-write-tests.sh --mpiio +source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate --mpiio # MPI-IO read example tests +source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate --mpiio + +# MPI-IO write example w/out laminate tests +source $UNIFYFS_CI_DIR/110-write-tests.sh --mpiio + +# MPI-IO read example w/out laminate tests source $UNIFYFS_CI_DIR/120-read-tests.sh --mpiio -### Producer-Consumer workload tests +### Producer-Consumer workload tests ### # POSIX-IO producer-consumer tests +source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate + +# POSIX-IO producer-consumer w/out laminate tests source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh # MPI-IO producer-consumer tests +source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate --mpiio + +# MPI-IO producer-consumer w/out laminate tests source 
$UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --mpiio -### unifyfs-stage tests +### unifyfs-stage tests ### source $UNIFYFS_CI_DIR/800-stage-tests.sh From 333ea6e496c7e7833d22fef3d8d560d9b7ac8916 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Tue, 4 Jan 2022 17:26:46 -0500 Subject: [PATCH 57/81] update margo usage * use timed versions of RPC forwarding functions, and add configuration settings for client-server and server-server RPC timeouts * for group RPCs, allow responses in any order * for RPCs that transfer extent metadata, avoid use of unifyfs_inode_extent_t and extent_tree_node structures NOTE: due to a bug in prior margo versions, we need to use 0.9.6 or later to use the timed async forwarding TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- .travis.yml | 2 +- bootstrap.sh | 13 +- client/src/margo_client.c | 158 +++++---- common/src/unifyfs_configurator.h | 8 +- common/src/unifyfs_const.h | 9 +- docs/dependencies.rst | 17 +- server/src/extent_tree.c | 506 +++++++++++++-------------- server/src/extent_tree.h | 130 +++---- server/src/margo_server.c | 48 ++- server/src/margo_server.h | 2 + server/src/unifyfs_fops_rpc.c | 60 +--- server/src/unifyfs_group_rpc.c | 339 +++++++++--------- server/src/unifyfs_inode.c | 73 ++-- server/src/unifyfs_inode.h | 31 +- server/src/unifyfs_p2p_rpc.c | 278 ++++++++------- server/src/unifyfs_p2p_rpc.h | 4 +- server/src/unifyfs_request_manager.h | 2 +- server/src/unifyfs_server.c | 33 +- server/src/unifyfs_service_manager.c | 17 +- server/src/unifyfs_service_manager.h | 4 +- server/src/unifyfs_transfer.c | 26 +- server/src/unifyfs_transfer.h | 2 +- 22 files changed, 914 insertions(+), 848 deletions(-) diff --git a/.travis.yml b/.travis.yml index 87d1e68aa..7e5b8ac3a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -130,7 +130,7 @@ before_install: install: - . $HOME/spack/share/spack/setup-env.sh - spack install gotcha@1.0.3 && spack load gotcha@1.0.3 - - spack install mochi-margo ^mercury~boostsys ^libfabric fabrics=rxm,sockets,tcp && spack load argobots && spack load mercury && spack load mochi-margo + - spack install mochi-margo@0.9.6 ^mercury~boostsys ^libfabric fabrics=rxm,sockets,tcp && spack load argobots && spack load mercury && spack load mochi-margo - spack install spath~mpi && spack load spath # prepare build environment - GOTCHA_INSTALL=$(spack location -i gotcha) diff --git a/bootstrap.sh b/bootstrap.sh index a7e90ac0d..df1237ead 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -47,10 +47,10 @@ if [ $use_old_margo -eq 1 ]; then mercury_version="v1.0.1" margo_version="v0.4.3" else - argobots_version="v1.0.1" - libfabric_version="v1.11.1" - mercury_version="v2.0.0" - margo_version="v0.9.1" + argobots_version="v1.1" + libfabric_version="v1.12.1" + mercury_version="v2.0.1" + margo_version="v0.9.6" repos+=(https://github.com/json-c/json-c.git) fi @@ -120,7 +120,7 @@ if [ "$automake_sub_version" -lt "15" ]; then popd # build automake - echo "### building automake v1.15 ###" + echo "### building automake ###" pushd automake-1.15 ./configure --prefix=$INSTALL_DIR make @@ -148,8 +148,9 @@ else fi echo "### building GOTCHA ###" +gotcha_version="1.0.3" cd GOTCHA -git checkout 1.0.3 +git checkout $gotcha_version mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" .. 
make -j $make_nproc && make install diff --git a/client/src/margo_client.c b/client/src/margo_client.c index 714b2640d..b3cc7f2b5 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -117,7 +117,10 @@ int unifyfs_client_rpc_init(void) } /* initialize margo */ - ctx->mid = margo_init(proto, MARGO_SERVER_MODE, 1, 1); + int use_progress_thread = 1; + int ult_pool_sz = 1; + ctx->mid = margo_init(proto, MARGO_SERVER_MODE, use_progress_thread, + ult_pool_sz); assert(ctx->mid); /* get server margo address */ @@ -216,6 +219,17 @@ static hg_handle_t create_handle(hg_id_t id) return handle; } +static int forward_to_server(hg_handle_t hdl, void* input_ptr) +{ + double timeout_msec = UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; + hg_return_t hret = margo_forward_timed(hdl, input_ptr, timeout_msec); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward_timed() failed - %s", HG_Error_to_string(hret)); + return UNIFYFS_ERROR_MARGO; + } + return UNIFYFS_SUCCESS; +} + /* invokes the mount rpc function */ int invoke_client_mount_rpc(unifyfs_client* client) { @@ -237,11 +251,11 @@ int invoke_client_mount_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the mount rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of mount rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* free memory on input struct */ @@ -251,7 +265,7 @@ int invoke_client_mount_rpc(unifyfs_client* client) /* decode response */ int ret; unifyfs_mount_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; @@ -267,7 +281,7 @@ int invoke_client_mount_rpc(unifyfs_client* client) } margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -319,23 +333,23 @@ int invoke_client_attach_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the attach rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of attach rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_attach_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -366,23 +380,23 @@ int invoke_client_unmount_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the unmount rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of unmount rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_unmount_out_t out; - hret = 
margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -425,23 +439,23 @@ int invoke_client_metaset_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the metaset rpc function in client - gfid:%d file:%s", in.attr.gfid, in.attr.filename); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of metaset rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_metaset_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -472,17 +486,17 @@ int invoke_client_metaget_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the metaget rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of metaget rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_metaget_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; @@ -496,7 +510,7 @@ int invoke_client_metaget_rpc(unifyfs_client* client, } margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -527,17 +541,17 @@ int invoke_client_filesize_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the filesize rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of filesize rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_filesize_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; @@ -546,7 +560,7 @@ int invoke_client_filesize_rpc(unifyfs_client* client, } margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -582,23 +596,23 @@ int invoke_client_transfer_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the transfer rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != 
UNIFYFS_SUCCESS) { + LOGERR("forward of transfer rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_transfer_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -630,23 +644,23 @@ int invoke_client_truncate_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the truncate rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of truncate rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_truncate_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -676,23 +690,23 @@ int invoke_client_unlink_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the unlink rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of unlink rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_unlink_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -722,23 +736,23 @@ int invoke_client_laminate_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the laminate rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of laminate rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_laminate_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -768,23 +782,23 @@ int invoke_client_sync_rpc(unifyfs_client* client, /* call rpc function */ LOGINFO("invoking the sync rpc function in client"); - hg_return_t hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != 
UNIFYFS_SUCCESS) { + LOGERR("forward of sync rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ int ret; unifyfs_fsync_out_t out; - hret = margo_get_output(handle, &out); + hg_return_t hret = margo_get_output(handle, &out); if (hret == HG_SUCCESS) { LOGDBG("Got response ret=%" PRIi32, out.ret); ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } @@ -827,11 +841,11 @@ int invoke_client_mread_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the mread rpc function in client"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_server(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of mread rpc to server failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ @@ -843,7 +857,7 @@ int invoke_client_mread_rpc(unifyfs_client* client, ret = (int) out.ret; margo_free_output(handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 3e50aedad..e187215ad 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -71,7 +71,7 @@ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ UNIFYFS_CFG(client, fsync_persist, BOOL, on, "persist written data to storage on fsync()", NULL) \ - UNIFYFS_CFG(client, local_extents, BOOL, off, "cache extents within client to service local reads without consulting local server", NULL) \ + UNIFYFS_CFG(client, local_extents, BOOL, off, "use client-cached extents to service local reads without consulting local server", NULL) \ UNIFYFS_CFG(client, max_files, INT, UNIFYFS_CLIENT_MAX_FILES, "client max file count", NULL) \ UNIFYFS_CFG(client, write_index_size, INT, UNIFYFS_CLIENT_WRITE_INDEX_SIZE, "write metadata index buffer size", NULL) \ UNIFYFS_CFG(client, write_sync, BOOL, off, "sync every write to server", NULL) \ @@ -84,13 +84,15 @@ UNIFYFS_CFG(logio, shmem_size, INT, UNIFYFS_LOGIO_SHMEM_SIZE, "log-based I/O shared memory region size", NULL) \ UNIFYFS_CFG(logio, spill_size, INT, UNIFYFS_LOGIO_SPILL_SIZE, "log-based I/O spillover file size", NULL) \ UNIFYFS_CFG(logio, spill_dir, STRING, NULLSTRING, "spillover directory", configurator_directory_check) \ - UNIFYFS_CFG(margo, lazy_connect, BOOL, off, "wait until first communication with server to resolve its connection address", NULL) \ + UNIFYFS_CFG(margo, client_pool_size, INT, UNIFYFS_MARGO_POOL_SZ, "size of server's ULT pool for client-server RPCs", NULL) \ + UNIFYFS_CFG(margo, lazy_connect, BOOL, on, "wait until first communication with server to resolve its connection address", NULL) \ + UNIFYFS_CFG(margo, server_pool_size, INT, UNIFYFS_MARGO_POOL_SZ, "size of server's ULT pool for server-server RPCs", NULL) \ UNIFYFS_CFG(margo, tcp, BOOL, on, "use TCP for server-to-server margo RPCs", NULL) \ UNIFYFS_CFG(meta, range_size, INT, UNIFYFS_META_DEFAULT_SLICE_SZ, "metadata range size", NULL) \ UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", 
configurator_directory_check, 'R', "specify full path to directory to contain server-local state") \ UNIFYFS_CFG_CLI(server, hostfile, STRING, NULLSTRING, "server hostfile name", NULL, 'H', "specify full path to server hostfile") \ UNIFYFS_CFG_CLI(server, init_timeout, INT, UNIFYFS_DEFAULT_INIT_TIMEOUT, "timeout of waiting for server initialization", NULL, 't', "timeout in seconds to wait for servers to be ready for clients") \ - UNIFYFS_CFG(server, local_extents, BOOL, off, "use server extents to service local reads without consulting file owner", NULL) \ + UNIFYFS_CFG(server, local_extents, BOOL, off, "use server-cached extents to service local reads without consulting file owner", NULL) \ UNIFYFS_CFG(server, max_app_clients, INT, UNIFYFS_SERVER_MAX_APP_CLIENTS, "maximum number of clients per application", NULL) \ UNIFYFS_CFG_CLI(sharedfs, dir, STRING, NULLSTRING, "shared file system directory", configurator_directory_check, 'S', "specify full path to directory to contain server shared files") \ diff --git a/common/src/unifyfs_const.h b/common/src/unifyfs_const.h index d9afef6a0..18cf9e569 100644 --- a/common/src/unifyfs_const.h +++ b/common/src/unifyfs_const.h @@ -52,13 +52,18 @@ #define UNIFYFS_CLIENT_WRITE_INDEX_SIZE (20 * MIB) #define UNIFYFS_CLIENT_MAX_READ_COUNT 1000 /* max # active read requests */ #define UNIFYFS_CLIENT_READ_TIMEOUT_SECONDS 60 -#define UNIFYFS_CLIENT_MAX_ACTIVE_REQUESTS 64 /* max concurrent client reqs */ +#define UNIFYFS_CLIENT_MAX_ACTIVE_REQUESTS 256 /* max concurrent client reqs */ // Log-based I/O Default Values #define UNIFYFS_LOGIO_CHUNK_SIZE (4 * MIB) #define UNIFYFS_LOGIO_SHMEM_SIZE (256 * MIB) #define UNIFYFS_LOGIO_SPILL_SIZE (4 * GIB) +// Margo Default Values +#define UNIFYFS_MARGO_POOL_SZ 4 +#define UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC 5000 /* 5.0 sec */ +#define UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC 15000 /* 15.0 sec */ + // Metadata Default Values #define UNIFYFS_META_DEFAULT_SLICE_SZ MIB /* data slice size for metadata */ @@ -67,7 +72,7 @@ #define UNIFYFS_SERVER_MAX_DATA_TX_SIZE (4 * MIB) /* to-client transmit size */ #define UNIFYFS_SERVER_MAX_NUM_APPS 64 /* max # apps/mountpoints supported */ #define UNIFYFS_SERVER_MAX_APP_CLIENTS 256 /* max # clients per application */ -#define UNIFYFS_SERVER_MAX_READS 2000 /* max server read reqs per reqmgr */ +#define UNIFYFS_SERVER_MAX_READS 2048 /* max # server read reqs per reqmgr */ // Utilities #define UNIFYFS_DEFAULT_INIT_TIMEOUT 120 /* server init timeout (seconds) */ diff --git a/docs/dependencies.rst b/docs/dependencies.rst index b40f88fab..9cd0ae400 100644 --- a/docs/dependencies.rst +++ b/docs/dependencies.rst @@ -6,16 +6,16 @@ UnifyFS Dependencies Required -------- -- `Automake `_ version 1.15 or later +- `Automake `_ version 1.15 (or later) -- `GOTCHA `_ version 1.0.3 +- `GOTCHA `_ version 1.0.3 (or later) -- `Margo `_ version 0.9.1 and its dependencies: +- `Margo `_ version 0.9.6 (or later) and its dependencies: - - `Argobots `_ version 1.0.1 - - `Mercury `_ version 2.0.0 + - `Argobots `_ version 1.1 (or later) + - `Mercury `_ version 2.0.1 (or later) - - `libfabric `_ and/or `bmi `_ + - `libfabric `_ or `bmi `_ - `JSON-C `_ @@ -25,9 +25,8 @@ Required Margo uses pkg-config to ensure it compiles and links correctly with all of its dependencies' libraries. When building manually, you'll need to set the - ``PKG_CONFIG_PATH`` environment variable and include in - that variable the paths for the ``.pc`` files for Mercury, Argobots, and - Margo separated by colons. 
+ ``PKG_CONFIG_PATH`` environment variable to include the paths of the + directories containing the ``.pc`` files for Mercury, Argobots, and Margo. -------- Optional diff --git a/server/src/extent_tree.c b/server/src/extent_tree.c index 4ff4aead8..48a5d7bb4 100644 --- a/server/src/extent_tree.c +++ b/server/src/extent_tree.c @@ -26,109 +26,94 @@ #undef MAX #define MAX(a, b) (a > b ? a : b) -int compare_func( - struct extent_tree_node* node1, - struct extent_tree_node* node2) +int etn_compare_func(struct extent_tree_node* node1, + struct extent_tree_node* node2) { - if (node1->start > node2->end) { + if (node1->extent.start > node2->extent.end) { return 1; - } else if (node1->end < node2->start) { + } else if (node1->extent.end < node2->extent.start) { return -1; } else { + /* any overlap is considered as "equal" */ return 0; } } -RB_PROTOTYPE(ext_tree, extent_tree_node, entry, compare_func) -RB_GENERATE(ext_tree, extent_tree_node, entry, compare_func) +RB_PROTOTYPE(ext_tree, extent_tree_node, entry, etn_compare_func) +RB_GENERATE(ext_tree, extent_tree_node, entry, etn_compare_func) /* Returns 0 on success, positive non-zero error code otherwise */ -int extent_tree_init(struct extent_tree* extent_tree) +int extent_tree_init(struct extent_tree* tree) { - memset(extent_tree, 0, sizeof(*extent_tree)); - pthread_rwlock_init(&extent_tree->rwlock, NULL); - RB_INIT(&extent_tree->head); + memset(tree, 0, sizeof(*tree)); + pthread_rwlock_init(&(tree->rwlock), NULL); + RB_INIT(&(tree->head)); return 0; } /* * Remove and free all nodes in the extent_tree. */ -void extent_tree_destroy(struct extent_tree* extent_tree) +void extent_tree_destroy(struct extent_tree* tree) { - extent_tree_clear(extent_tree); - pthread_rwlock_destroy(&extent_tree->rwlock); + extent_tree_clear(tree); + pthread_rwlock_destroy(&(tree->rwlock)); } /* Allocate a node for the range tree. Free node with free() when finished */ -static struct extent_tree_node* extent_tree_node_alloc( - unsigned long start, /* logical starting offset of extent */ - unsigned long end, /* logical ending offset of extent */ - int svr_rank, /* rank of server hosting data */ - int app_id, /* application id (namespace) on server rank */ - int cli_id, /* client rank on server rank */ - unsigned long pos) /* physical offset of data in log */ +static +struct extent_tree_node* extent_tree_node_alloc(extent_metadata* extent) { /* allocate a new node structure */ struct extent_tree_node* node = calloc(1, sizeof(*node)); - if (!node) { - return NULL; + if (NULL != node) { + memcpy(&(node->extent), extent, sizeof(*extent)); } - - /* record logical range and physical offset */ - node->start = start; - node->end = end; - node->svr_rank = svr_rank; - node->app_id = app_id; - node->cli_id = cli_id; - node->pos = pos; - return node; } /* * Given two start/end ranges, return a new range from start1/end1 that - * does not overlap start2/end2. The non-overlapping range is stored - * in new_start/new_end. If there are no non-overlapping ranges, - * return 1 from this function, else return 0. If there are two - * non-overlapping ranges, return the first one in new_start/new_end. + * does not overlap start2/end2. The non-overlapping range is stored + * in range_start/range_end. If there are no non-overlapping ranges, + * return 1 from this function, else return 0. If there are two + * non-overlapping ranges, return the first one in range_start/range_end. 
*/ static int get_non_overlapping_range( unsigned long start1, unsigned long end1, - long start2, long end2, - long* new_start, long* new_end) + unsigned long start2, unsigned long end2, + unsigned long* range_start, unsigned long* range_end) { /* this function is only called when we know that segment 1 * and segment 2 overlap with each other, find first portion * of segment 1 that does not overlap with segment 2, if any */ if (start1 < start2) { - /* Segment 1 inlcudes a portion before segment 2 starts - * return start/end of that leading portion of segment 1 + /* Segment 1 includes a portion before segment 2 starts. + * Set range to start/end of that leading portion of segment 1 * * s1-------e1 * s2--------e2 * ---- non-overlap */ - *new_start = start1; - *new_end = start2 - 1; + *range_start = start1; + *range_end = start2 - 1; return 0; } else if (end1 > end2) { /* Segment 1 does not start before segment 2, - * but segment 1 extends past end of segment 2 - * return start/end of trailing portion of segment 1 + * but segment 1 extends past end of segment 2. + * Set range to start/end of trailing portion of segment 1 * * s1-----e1 * s2-------e2 * --- non-overlap */ - *new_start = end2 + 1; - *new_end = end1; + *range_start = end2 + 1; + *range_end = end1; return 0; } - /* Segment 2 completely envelops segment 1 - * nothing left of segment 1 to return - * so return 1 to indicate this case + /* Segment 2 completely envelops segment 1. + * Return 1 to indicate this case * * s1-------e1 * s2-------------e2 @@ -139,142 +124,145 @@ static int get_non_overlapping_range( /* * Add an entry to the range tree. Returns 0 on success, nonzero otherwise. */ -int extent_tree_add( - struct extent_tree* extent_tree, /* tree to add new extent item */ - unsigned long start, /* logical starting offset of extent */ - unsigned long end, /* logical ending offset of extent */ - int svr_rank, /* rank of server hosting data */ - int app_id, /* application id (namespace) on server rank */ - int cli_id, /* client rank on server rank */ - unsigned long pos) /* physical offset of data in log */ +int extent_tree_add(struct extent_tree* tree, + struct extent_metadata* extent) { /* assume we'll succeed */ - int rc = 0; + int ret = 0; /* Create node to define our new range */ - struct extent_tree_node* node = extent_tree_node_alloc( - start, end, svr_rank, app_id, cli_id, pos); + struct extent_tree_node* node = extent_tree_node_alloc(extent); if (!node) { return ENOMEM; } /* lock the tree so we can modify it */ - extent_tree_wrlock(extent_tree); + extent_tree_wrlock(tree); /* Try to insert our range into the RB tree. If it overlaps with any other * range, then it is not inserted, and the overlapping range node is - * returned in 'overlap'. If 'overlap' is NULL, then there were no + * returned in 'conflict'. If 'conflict' is NULL, then there were no * overlaps, and our range was successfully inserted. */ - struct extent_tree_node* overlap; - while ((overlap = RB_INSERT(ext_tree, &extent_tree->head, node))) { - /* Our range overlaps with another range (in 'overlap'). Is there any - * any part of 'overlap' that does not overlap our range? If so, - * delete the old 'overlap' and insert the smaller, non-overlapping - * range. 
*/ - long new_start = 0; - long new_end = 0; - int ret = get_non_overlapping_range(overlap->start, overlap->end, - start, end, &new_start, &new_end); - if (ret) { + struct extent_tree_node* conflict; + while ((conflict = RB_INSERT(ext_tree, &tree->head, node)) != NULL) { + /* Our range overlaps with another range (in 'conflict'). Is there any + * any part of 'conflict' that is outside our range? If so, delete + * the old 'conflict' and insert the smaller, non-overlapping range */ + unsigned long new_start = 0; + unsigned long new_end = 0; + unsigned long new_pos = 0; + int rc = get_non_overlapping_range(conflict->extent.start, + conflict->extent.end, + extent->start, extent->end, + &new_start, &new_end); + if (rc) { /* The new range we are adding completely covers the existing - * range in the tree defined in overlap. - * We can't find a non-overlapping range. + * range in the tree defined in 'conflict'. * Delete the existing range. */ - RB_REMOVE(ext_tree, &extent_tree->head, overlap); - free(overlap); - extent_tree->count--; + RB_REMOVE(ext_tree, &tree->head, conflict); + free(conflict); + tree->count--; } else { - /* Part of the old range was non-overlapping. Split the old range - * into two ranges: one for the non-overlapping section, and one for - * the remaining section. The non-overlapping section gets - * inserted without issue. The remaining section will be processed - * on the next pass of this while() loop. */ - struct extent_tree_node* resized = extent_tree_node_alloc( - new_start, new_end, - overlap->svr_rank, overlap->app_id, overlap->cli_id, - overlap->pos + (new_start - overlap->start)); - if (!resized) { + /* Part of the old range 'conflict' was non-overlapping. Create a + * new smaller extent for that non-overlap portion. */ + new_pos = conflict->extent.log_pos + + (new_start - conflict->extent.start); + extent_metadata non_overlap_extent = { + .start = new_start, + .end = new_end, + .log_pos = new_pos, + .svr_rank = conflict->extent.svr_rank, + .app_id = conflict->extent.app_id, + .cli_id = conflict->extent.cli_id + }; + struct extent_tree_node* non_overlap = + extent_tree_node_alloc(&non_overlap_extent); + if (NULL == non_overlap) { /* failed to allocate memory for range node, * bail out and release lock without further * changing state of extent tree */ free(node); - rc = ENOMEM; + ret = ENOMEM; goto release_add; } - /* if the non-overlapping part came from the front - * portion of the existing range, then there is a - * trailing portion of the existing range to add back - * to be considered again in the next loop iteration */ - struct extent_tree_node* remaining = NULL; - if (resized->end < overlap->end) { - /* There's still a remaining section after the non-overlapping - * part. Add it in. 
*/ - remaining = extent_tree_node_alloc( - resized->end + 1, overlap->end, - overlap->svr_rank, overlap->app_id, overlap->cli_id, - overlap->pos + (resized->end + 1 - overlap->start)); - if (!remaining) { + /* If the non-overlapping part came from the front portion of + * 'conflict', create an extent that covers the tail portion */ + struct extent_tree_node* conflict_tail = NULL; + if (non_overlap_extent.end < conflict->extent.end) { + new_start = non_overlap_extent.end + 1; + new_end = conflict->extent.end; + new_pos = conflict->extent.log_pos + + (new_start - conflict->extent.start); + extent_metadata tail_extent = { + .start = new_start, + .end = new_end, + .log_pos = new_pos, + .svr_rank = conflict->extent.svr_rank, + .app_id = conflict->extent.app_id, + .cli_id = conflict->extent.cli_id + }; + conflict_tail = extent_tree_node_alloc(&tail_extent); + if (NULL == conflict_tail) { /* failed to allocate memory for range node, * bail out and release lock without further * changing state of extent tree */ free(node); - free(resized); - rc = ENOMEM; + free(non_overlap); + ret = ENOMEM; goto release_add; } } - /* Remove our old range and release it */ - RB_REMOVE(ext_tree, &extent_tree->head, overlap); - free(overlap); - extent_tree->count--; - - /* Insert the non-overlapping part of the new range */ - RB_INSERT(ext_tree, &extent_tree->head, resized); - extent_tree->count++; - - /* if we have a trailing portion, insert range for that, - * and increase our extent count since we just turned one - * range entry into two */ - if (remaining != NULL) { - RB_INSERT(ext_tree, &extent_tree->head, remaining); - extent_tree->count++; + /* Remove old range 'conflict' and release it */ + RB_REMOVE(ext_tree, &tree->head, conflict); + free(conflict); + tree->count--; + + /* Insert the non-overlapping part of the old range */ + RB_INSERT(ext_tree, &tree->head, non_overlap); + tree->count++; + + /* If we have a tail portion, insert it and increase our extent + * count since we just turned one range entry into two */ + if (conflict_tail != NULL) { + RB_INSERT(ext_tree, &tree->head, conflict_tail); + tree->count++; } } } /* increment segment count in the tree for the * new range we just added */ - extent_tree->count++; + tree->count++; /* update max ending offset if end of new range * we just inserted is larger */ - extent_tree->max = MAX(extent_tree->max, end); + tree->max = MAX(tree->max, extent->end); /* get temporary pointer to the node we just added */ struct extent_tree_node* target = node; /* check whether we can coalesce new extent with any preceding extent */ - struct extent_tree_node* prev = RB_PREV( - ext_tree, &extent_tree->head, target); - if (prev != NULL && prev->end + 1 == target->start) { - /* found a extent that ends just before the new extent starts, + struct extent_tree_node* prev = RB_PREV(ext_tree, &tree->head, target); + if ((NULL != prev) && (prev->extent.end + 1 == target->extent.start)) { + /* found an extent that ends just before the new extent starts, * check whether they are also contiguous in the log */ - unsigned long pos_end = prev->pos + (prev->end - prev->start + 1); - if (prev->svr_rank == target->svr_rank && - prev->cli_id == target->cli_id && - prev->app_id == target->app_id && - pos_end == target->pos) { + unsigned long pos_next = prev->extent.log_pos + + extent_length(&(prev->extent)); + if ((prev->extent.svr_rank == target->extent.svr_rank) && + (prev->extent.cli_id == target->extent.cli_id) && + (prev->extent.app_id == target->extent.app_id) && + (pos_next == 
target->extent.log_pos)) { /* the preceding extent describes a log position adjacent to * the extent we just added, so we can merge them, * append entry to previous by extending end of previous */ - prev->end = target->end; + prev->extent.end = target->extent.end; /* delete new extent from the tree and free it */ - RB_REMOVE(ext_tree, &extent_tree->head, target); + RB_REMOVE(ext_tree, &tree->head, target); free(target); - extent_tree->count--; + tree->count--; /* update target to point at previous extent since we just * merged our new extent into it */ @@ -283,56 +271,63 @@ int extent_tree_add( } /* check whether we can coalesce new extent with any trailing extent */ - struct extent_tree_node* next = RB_NEXT( - ext_tree, &extent_tree->head, target); - if (next != NULL && target->end + 1 == next->start) { + struct extent_tree_node* next = RB_NEXT(ext_tree, &tree->head, + target); + if ((NULL != next) && (target->extent.end + 1 == next->extent.start)) { /* found a extent that starts just after the new extent ends, * check whether they are also contiguous in the log */ - unsigned long pos_end = target->pos + (target->end - target->start + 1); - if (target->svr_rank == next->svr_rank && - target->cli_id == next->cli_id && - target->app_id == next->app_id && - pos_end == next->pos) { + unsigned long pos_next = target->extent.log_pos + + extent_length(&(target->extent)); + if (target->extent.svr_rank == next->extent.svr_rank && + target->extent.cli_id == next->extent.cli_id && + target->extent.app_id == next->extent.app_id && + pos_next == next->extent.log_pos) { /* the target extent describes a log position adjacent to * the next extent, so we can merge them, * append entry to target by extending end of to cover next */ - target->end = next->end; + target->extent.end = next->extent.end; /* delete next extent from the tree and free it */ - RB_REMOVE(ext_tree, &extent_tree->head, next); + RB_REMOVE(ext_tree, &tree->head, next); free(next); - extent_tree->count--; + tree->count--; } } release_add: /* done modifying the tree */ - extent_tree_unlock(extent_tree); + extent_tree_unlock(tree); - return rc; + return ret; } /* search tree for entry that overlaps with given start/end * offsets, return first overlapping entry if found, NULL otherwise, * assumes caller has lock on tree */ struct extent_tree_node* extent_tree_find( - struct extent_tree* extent_tree, /* tree to search */ + struct extent_tree* tree, /* tree to search */ unsigned long start, /* starting offset to search */ unsigned long end) /* ending offset to search */ { /* Create a range of just our starting byte offset */ - struct extent_tree_node* node = extent_tree_node_alloc( - start, start, 0, 0, 0, 0); - if (!node) { + extent_metadata start_byte = { + .start = start, + .end = start, + .log_pos = 0, + .svr_rank = 0, + .app_id = 0, + .cli_id = 0 + }; + struct extent_tree_node* node = extent_tree_node_alloc(&start_byte); + if (NULL == node) { return NULL; } /* search tree for either a range that overlaps with * the target range (starting byte), or otherwise the * node for the next biggest starting byte */ - struct extent_tree_node* next = RB_NFIND( - ext_tree, &extent_tree->head, node); + struct extent_tree_node* next = RB_NFIND(ext_tree, &tree->head, node); free(node); @@ -340,7 +335,7 @@ struct extent_tree_node* extent_tree_find( * byte offset, but it would be the range with the lowest * starting offset after the target starting offset, check whether * this overlaps our end offset */ - if (next && next->start <= end) { + if ((NULL != 
next) && (next->extent.start <= end)) { return next; } @@ -369,21 +364,21 @@ int extent_tree_truncate( /* iterate backwards until we find an extent below * the truncated size */ - while (node != NULL && node->end >= size) { + while ((NULL != node) && (node->extent.end >= size)) { /* found an extent whose ending offset is equal to or - * extends beyond the truncated size, + * extends beyond the truncated size. * check whether the full extent is beyond the truncated * size or whether the new size falls within this extent */ - if (node->start >= size) { + if (node->extent.start >= size) { /* the start offset is also beyond the truncated size, - * meaning the entire range is beyond the truncated size, - * get pointer to next previous extent in tree */ + * meaning the entire range is beyond the truncated size. + * get pointer to previous extent in tree */ struct extent_tree_node* oldnode = node; node = RB_PREV(ext_tree, &tree->head, node); /* remove this node from the tree and release it */ LOGDBG("removing node [%lu, %lu] due to truncate=%lu", - node->start, node->end, size); + node->extent.start, node->extent.end, size); RB_REMOVE(ext_tree, &tree->head, oldnode); free(oldnode); @@ -391,8 +386,9 @@ int extent_tree_truncate( tree->count--; } else { /* the range of this node overlaps with the truncated size - * so just update its end to be the new size */ - node->end = size - 1; + * so just update its end to be the new last byte offset */ + unsigned long last_byte = size - 1; + node->extent.end = last_byte; break; } } @@ -400,7 +396,7 @@ int extent_tree_truncate( /* update maximum offset in tree */ if (node != NULL) { /* got at least one extent left, update maximum field */ - tree->max = node->end; + tree->max = node->extent.end; } else { /* no extents left in the tree, set max back to 0 */ tree->max = 0; @@ -419,27 +415,27 @@ int extent_tree_truncate( * * This is meant to be called in a loop, like: * - * extent_tree_rdlock(extent_tree); + * extent_tree_rdlock(tree); * * struct extent_tree_node *node = NULL; - * while ((node = extent_tree_iter(extent_tree, node))) { - * printf("[%d-%d]", node->start, node->end); + * while ((node = extent_tree_iter(tree, node))) { + * printf("[%d-%d]", node->extent.start, node->extent.end); * } * - * extent_tree_unlock(extent_tree); + * extent_tree_unlock(tree); * * Note: this function does no locking, and assumes you're properly locking * and unlocking the extent_tree before doing the iteration (see * extent_tree_rdlock()/extent_tree_wrlock()/extent_tree_unlock()). */ struct extent_tree_node* extent_tree_iter( - struct extent_tree* extent_tree, + struct extent_tree* tree, struct extent_tree_node* start) { struct extent_tree_node* next = NULL; - if (start == NULL) { + if (NULL == start) { /* Initial case, no starting node */ - next = RB_MIN(ext_tree, &extent_tree->head); + next = RB_MIN(ext_tree, &tree->head); return next; } @@ -447,14 +443,14 @@ struct extent_tree_node* extent_tree_iter( * We were given a valid start node. Look it up to start our traversal * from there. */ - next = RB_FIND(ext_tree, &extent_tree->head, start); - if (!next) { + next = RB_FIND(ext_tree, &tree->head, start); + if (NULL == next) { /* Some kind of error */ return NULL; } /* Look up our next node */ - next = RB_NEXT(ext_tree, &extent_tree->head, start); + next = RB_NEXT(ext_tree, &tree->head, start); return next; } @@ -464,9 +460,9 @@ struct extent_tree_node* extent_tree_iter( * extent_tree_iter(). All the other extent_tree functions provide their * own locking. 
*/ -void extent_tree_rdlock(struct extent_tree* extent_tree) +void extent_tree_rdlock(struct extent_tree* tree) { - int rc = pthread_rwlock_rdlock(&extent_tree->rwlock); + int rc = pthread_rwlock_rdlock(&(tree->rwlock)); if (rc) { LOGERR("pthread_rwlock_rdlock() failed - rc=%d", rc); } @@ -477,9 +473,9 @@ void extent_tree_rdlock(struct extent_tree* extent_tree) * extent_tree_iter(). All the other extent_tree functions provide their * own locking. */ -void extent_tree_wrlock(struct extent_tree* extent_tree) +void extent_tree_wrlock(struct extent_tree* tree) { - int rc = pthread_rwlock_wrlock(&extent_tree->rwlock); + int rc = pthread_rwlock_wrlock(&(tree->rwlock)); if (rc) { LOGERR("pthread_rwlock_wrlock() failed - rc=%d", rc); } @@ -490,9 +486,9 @@ void extent_tree_wrlock(struct extent_tree* extent_tree) * extent_tree_iter(). All the other extent_tree functions provide their * own locking. */ -void extent_tree_unlock(struct extent_tree* extent_tree) +void extent_tree_unlock(struct extent_tree* tree) { - int rc = pthread_rwlock_unlock(&extent_tree->rwlock); + int rc = pthread_rwlock_unlock(&(tree->rwlock)); if (rc) { LOGERR("pthread_rwlock_unlock() failed - rc=%d", rc); } @@ -502,68 +498,68 @@ void extent_tree_unlock(struct extent_tree* extent_tree) * Remove all nodes in extent_tree, but keep it initialized so you can * extent_tree_add() to it. */ -void extent_tree_clear(struct extent_tree* extent_tree) +void extent_tree_clear(struct extent_tree* tree) { struct extent_tree_node* node = NULL; struct extent_tree_node* oldnode = NULL; - extent_tree_wrlock(extent_tree); + extent_tree_wrlock(tree); - if (RB_EMPTY(&extent_tree->head)) { + if (RB_EMPTY(&tree->head)) { /* extent_tree is empty, nothing to do */ - extent_tree_unlock(extent_tree); + extent_tree_unlock(tree); return; } /* Remove and free each node in the tree */ - while ((node = extent_tree_iter(extent_tree, node))) { - if (oldnode) { - RB_REMOVE(ext_tree, &extent_tree->head, oldnode); + while ((node = extent_tree_iter(tree, node)) != NULL) { + if (NULL != oldnode) { + RB_REMOVE(ext_tree, &tree->head, oldnode); free(oldnode); } oldnode = node; } - if (oldnode) { - RB_REMOVE(ext_tree, &extent_tree->head, oldnode); + if (NULL != oldnode) { + RB_REMOVE(ext_tree, &tree->head, oldnode); free(oldnode); } - extent_tree->count = 0; - extent_tree->max = 0; - extent_tree_unlock(extent_tree); + tree->count = 0; + tree->max = 0; + extent_tree_unlock(tree); } /* Return the number of segments in the segment tree */ -unsigned long extent_tree_count(struct extent_tree* extent_tree) +unsigned long extent_tree_count(struct extent_tree* tree) { - extent_tree_rdlock(extent_tree); - unsigned long count = extent_tree->count; - extent_tree_unlock(extent_tree); + extent_tree_rdlock(tree); + unsigned long count = tree->count; + extent_tree_unlock(tree); return count; } /* Return the maximum ending logical offset in the tree */ -unsigned long extent_tree_max_offset(struct extent_tree* extent_tree) +unsigned long extent_tree_max_offset(struct extent_tree* tree) { - extent_tree_rdlock(extent_tree); - unsigned long max = extent_tree->max; - extent_tree_unlock(extent_tree); + extent_tree_rdlock(tree); + unsigned long max = tree->max; + extent_tree_unlock(tree); return max; } -/* given an extent tree and starting and ending logical offsets, - * fill in key/value entries that overlap that range, returns at - * most max entries starting from lowest starting offset, - * sets outnum with actual number of entries returned */ +/* Given an extent tree and starting and 
ending logical offsets, + * fill in key/value entries that overlap that range. + * Returns at most max entries starting from lowest starting offset. + * Sets outnum with actual number of entries returned */ int extent_tree_span( - struct extent_tree* extent_tree, /* extent tree to search */ - int gfid, /* global file id we're looking in */ - unsigned long start, /* starting logical offset */ - unsigned long end, /* ending logical offset */ - int max, /* maximum number of key/vals to return */ - void* _keys, /* array of length max for output keys */ - void* _vals, /* array of length max for output values */ - int* outnum) /* number of entries returned */ + struct extent_tree* tree, /* extent tree to search */ + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* _keys, /* array of length max for output keys */ + void* _vals, /* array of length max for output values */ + int* outnum) /* number of entries returned */ { unifyfs_key_t* keys = (unifyfs_key_t*) _keys; unifyfs_val_t* vals = (unifyfs_val_t*) _vals; @@ -572,40 +568,40 @@ int extent_tree_span( *outnum = 0; /* lock the tree for reading */ - extent_tree_rdlock(extent_tree); + extent_tree_rdlock(tree); int count = 0; - struct extent_tree_node* next = extent_tree_find(extent_tree, start, end); - while (next != NULL && - next->start <= end && - count < max) { - /* got an entry that overlaps with given span */ + struct extent_tree_node* next = extent_tree_find(tree, start, end); + while ((NULL != next) && + (next->extent.start <= end) && + (count < max)) { + /* got an entry that overlaps with given range */ /* fill in key */ unifyfs_key_t* key = &keys[count]; key->gfid = gfid; - key->offset = next->start; + key->offset = next->extent.start; /* fill in value */ unifyfs_val_t* val = &vals[count]; - val->addr = next->pos; - val->len = next->end - next->start + 1; - val->delegator_rank = next->svr_rank; - val->app_id = next->app_id; - val->rank = next->cli_id; + val->addr = next->extent.log_pos; + val->len = next->extent.end - next->extent.start + 1; + val->delegator_rank = next->extent.svr_rank; + val->app_id = next->extent.app_id; + val->rank = next->extent.cli_id; /* increment the number of key/values we found */ count++; /* get the next element in the tree */ - next = extent_tree_iter(extent_tree, next); + next = extent_tree_iter(tree, next); } /* return to user the number of key/values we set */ *outnum = count; /* done reading the tree */ - extent_tree_unlock(extent_tree); + extent_tree_unlock(tree); return 0; } @@ -616,39 +612,39 @@ static void chunk_req_from_extent( struct extent_tree_node* n, chunk_read_req_t* chunk) { - unsigned long offset = n->start; - unsigned long nbytes = n->end - n->start + 1; - unsigned long log_offset = n->pos; - unsigned long last = req_offset + req_len - 1; + unsigned long offset = n->extent.start; + unsigned long nbytes = n->extent.end - n->extent.start + 1; + unsigned long log_offset = n->extent.log_pos; + unsigned long last = req_offset + req_len - 1; + unsigned long diff; if (offset < req_offset) { - unsigned long diff = req_offset - offset; - + diff = req_offset - offset; offset = req_offset; log_offset += diff; nbytes -= diff; } - if (n->end > last) { - unsigned long diff = n->end - last; + if (n->extent.end > last) { + diff = n->extent.end - last; nbytes -= diff; } - chunk->offset = offset; - chunk->nbytes = nbytes; - chunk->log_offset = 
log_offset; - chunk->rank = n->svr_rank; - chunk->log_client_id = n->cli_id; - chunk->log_app_id = n->app_id; + chunk->offset = offset; + chunk->nbytes = nbytes; + chunk->log_offset = log_offset; + chunk->rank = n->extent.svr_rank; + chunk->log_client_id = n->extent.cli_id; + chunk->log_app_id = n->extent.app_id; } int extent_tree_get_chunk_list( - struct extent_tree* extent_tree, /* extent tree to search */ - unsigned long offset, /* starting logical offset */ - unsigned long len, /* length of extent */ - unsigned int* n_chunks, /* [out] number of chunks returned */ - chunk_read_req_t** chunks, /* [out] chunk array */ - int* extent_covered) /* [out] set=1 if extent fully covered */ + struct extent_tree* tree, /* extent tree to search */ + unsigned long offset, /* starting logical offset */ + unsigned long len, /* length of extent */ + unsigned int* n_chunks, /* [out] number of chunks returned */ + chunk_read_req_t** chunks, /* [out] chunk array */ + int* extent_covered) /* [out] set=1 if extent fully covered */ { int ret = 0; unsigned int count = 0; @@ -663,27 +659,27 @@ int extent_tree_get_chunk_list( *extent_covered = 0; - extent_tree_rdlock(extent_tree); + extent_tree_rdlock(tree); - first = extent_tree_find(extent_tree, offset, end); + first = extent_tree_find(tree, offset, end); next = first; - while (next && next->start <= end) { + while ((NULL != next) && next->extent.start <= end) { count++; if (!gap_found) { - unsigned long curr_start = next->start; + unsigned long curr_start = next->extent.start; if (next != first) { /* check for a gap between current and previous extent */ if ((prev_end + 1) != curr_start) { gap_found = true; } } - prev_end = next->end; + prev_end = next->extent.end; } /* iterate to next extent */ last = next; - next = extent_tree_iter(extent_tree, next); + next = extent_tree_iter(tree, next); } *n_chunks = count; @@ -691,32 +687,32 @@ int extent_tree_get_chunk_list( gap_found = true; goto out_unlock; } else { - if ((first->start > offset) || (last->end < end)) { + if ((first->extent.start > offset) || (last->extent.end < end)) { gap_found = true; } } out_chunks = calloc(count, sizeof(*out_chunks)); - if (!out_chunks) { + if (NULL == out_chunks) { ret = ENOMEM; goto out_unlock; } next = first; current = out_chunks; - while (next && next->start <= end) { + while ((NULL != next) && (next->extent.start <= end)) { /* trim out the extent so it does not include the data that is not * requested */ chunk_req_from_extent(offset, len, next, current); - next = extent_tree_iter(extent_tree, next); + next = extent_tree_iter(tree, next); current += 1; } *chunks = out_chunks; out_unlock: - extent_tree_unlock(extent_tree); + extent_tree_unlock(tree); if (!gap_found) { *extent_covered = 1; diff --git a/server/src/extent_tree.h b/server/src/extent_tree.h index ac53ba61e..55f38aca4 100644 --- a/server/src/extent_tree.h +++ b/server/src/extent_tree.h @@ -17,22 +17,29 @@ #include "unifyfs_global.h" +typedef struct extent_metadata { + /* extent metadata */ + unsigned long start; /* logical offset of extent's first byte */ + unsigned long end; /* logical offset of extent's last byte */ + + /* logio metadata */ + unsigned long log_pos; /* physical offset of data in log */ + int svr_rank; /* rank of server hosting the log */ + int app_id; /* application id (namespace) of client */ + int cli_id; /* rank (on host server) of client */ +} extent_metadata; + +#define extent_length(meta_ptr) \ + ((size_t)1 + ((meta_ptr)->end - (meta_ptr)->start)) + +#define extent_offset(meta_ptr) \ + 
(off_t)((meta_ptr)->start) + struct extent_tree_node { RB_ENTRY(extent_tree_node) entry; - unsigned long start; /* starting logical offset of range */ - unsigned long end; /* ending logical offset of range */ - int svr_rank; /* rank of server hosting data */ - int app_id; /* application id (namespace) on server rank */ - int cli_id; /* client rank on server rank */ - unsigned long pos; /* physical offset of data in log */ + struct extent_metadata extent; }; -#define extent_tree_node_offset(node_ptr) \ - ((off_t)(node_ptr)->start) - -#define extent_tree_node_length(node_ptr) \ - ((size_t)1 + ((node_ptr)->end - (node_ptr)->start)) - struct extent_tree { RB_HEAD(ext_tree, extent_tree_node) head; pthread_rwlock_t rwlock; @@ -41,47 +48,40 @@ struct extent_tree { }; /* Returns 0 on success, positive non-zero error code otherwise */ -int extent_tree_init(struct extent_tree* extent_tree); +int extent_tree_init(struct extent_tree* tree); /* - * Remove all nodes in extent_tree, but keep it initialized so you can + * Remove all nodes in tree, but keep it initialized so you can * extent_tree_add() to it. */ -void extent_tree_clear(struct extent_tree* extent_tree); +void extent_tree_clear(struct extent_tree* tree); /* - * Remove and free all nodes in the extent_tree. + * Remove and free all nodes in the tree. */ -void extent_tree_destroy(struct extent_tree* extent_tree); +void extent_tree_destroy(struct extent_tree* tree); /* * Add an entry to the range tree. Returns 0 on success, nonzero otherwise. */ -int extent_tree_add( - struct extent_tree* extent_tree, /* tree to add new extent item */ - unsigned long start, /* logical starting offset of extent */ - unsigned long end, /* logical ending offset of extent */ - int svr_rank, /* rank of server hosting data */ - int app_id, /* application id (namespace) on server rank */ - int cli_id, /* client rank on server rank */ - unsigned long pos /* physical offset of data in log */ -); +int extent_tree_add(struct extent_tree* tree, + struct extent_metadata* extent); /* search tree for entry that overlaps with given start/end * offsets, return first overlapping entry if found, NULL otherwise, * assumes caller has lock on tree */ struct extent_tree_node* extent_tree_find( - struct extent_tree* extent_tree, /* tree to search */ - unsigned long start, /* starting offset to search */ - unsigned long end /* ending offset to search */ + struct extent_tree* tree, /* tree to search */ + unsigned long start, /* starting offset to search */ + unsigned long end /* ending offset to search */ ); /* truncate extents to use new maximum, discards extent entries * that exceed the new truncated size, and rewrites any entry * that overlaps */ int extent_tree_truncate( - struct extent_tree* extent_tree, /* tree to truncate */ - unsigned long size /* size to truncate extents to */ + struct extent_tree* tree, /* tree to truncate */ + unsigned long size /* size to truncate extents to */ ); /* @@ -91,41 +91,41 @@ int extent_tree_truncate( * * This is meant to be called in a loop, like: * - * extent_tree_rdlock(extent_tree); + * extent_tree_rdlock(tree); * * struct extent_tree_node *node = NULL; - * while ((node = extent_tree_iter(extent_tree, node))) { + * while ((node = extent_tree_iter(tree, node))) { * printf("[%d-%d]", node->start, node->end); * } * - * extent_tree_unlock(extent_tree); + * extent_tree_unlock(tree); * * Note: this function does no locking, and assumes you're properly locking - * and unlocking the extent_tree before doing the iteration (see + * and unlocking the tree 
before doing the iteration (see * extent_tree_rdlock()/extent_tree_wrlock()/extent_tree_unlock()). */ struct extent_tree_node* extent_tree_iter( - struct extent_tree* extent_tree, + struct extent_tree* tree, struct extent_tree_node* start); /* Return the number of segments in the segment tree */ -unsigned long extent_tree_count(struct extent_tree* extent_tree); +unsigned long extent_tree_count(struct extent_tree* tree); /* Return the maximum ending logical offset in the tree */ -unsigned long extent_tree_max_offset(struct extent_tree* extent_tree); +unsigned long extent_tree_max_offset(struct extent_tree* tree); /* * Locking functions for use with extent_tree_iter(). They allow you to * lock the tree to iterate over it: * - * extent_tree_rdlock(&extent_tree); + * extent_tree_rdlock(&tree); * * struct extent_tree_node *node = NULL; - * while ((node = extent_tree_iter(extent_tree, node))) { + * while ((node = extent_tree_iter(tree, node))) { * printf("[%d-%d]", node->start, node->end); * } * - * extent_tree_unlock(&extent_tree); + * extent_tree_unlock(&tree); */ /* @@ -133,62 +133,62 @@ unsigned long extent_tree_max_offset(struct extent_tree* extent_tree); * extent_tree_iter(). All the other extent_tree functions provide their * own locking. */ -void extent_tree_rdlock(struct extent_tree* extent_tree); +void extent_tree_rdlock(struct extent_tree* tree); /* * Lock a extent_tree for read/write. This should only be used for calling * extent_tree_iter(). All the other extent_tree functions provide their * own locking. */ -void extent_tree_wrlock(struct extent_tree* extent_tree); +void extent_tree_wrlock(struct extent_tree* tree); /* * Unlock a extent_tree for read/write. This should only be used for calling * extent_tree_iter(). All the other extent_tree functions provide their * own locking. 
*/ -void extent_tree_unlock(struct extent_tree* extent_tree); +void extent_tree_unlock(struct extent_tree* tree); /* given an extent tree and starting and ending logical offsets, * fill in key/value entries that overlap that range, returns at * most max entries starting from lowest starting offset, * sets outnum with actual number of entries returned */ int extent_tree_span( - struct extent_tree* extent_tree, /* extent tree to search */ - int gfid, /* global file id we're looking in */ - unsigned long start, /* starting logical offset */ - unsigned long end, /* ending logical offset */ - int max, /* maximum number of key/vals to return */ - void* keys, /* array of length max for output keys */ - void* vals, /* array of length max for output values */ - int* outnum); /* number of entries returned */ + struct extent_tree* tree, /* extent tree to search */ + int gfid, /* global file id we're looking in */ + unsigned long start, /* starting logical offset */ + unsigned long end, /* ending logical offset */ + int max, /* maximum number of key/vals to return */ + void* keys, /* array of length max for output keys */ + void* vals, /* array of length max for output values */ + int* outnum); /* number of entries returned */ int extent_tree_get_chunk_list( - struct extent_tree* extent_tree, /* extent tree to search */ - unsigned long offset, /* starting logical offset */ - unsigned long len, /* length of extent */ - unsigned int* n_chunks, /* [out] number of chunks returned */ - chunk_read_req_t** chunks, /* [out] chunk array */ - int* extent_covered); /* [out] set=1 if extent fully covered */ + struct extent_tree* tree, /* extent tree to search */ + unsigned long offset, /* starting logical offset */ + unsigned long len, /* length of extent */ + unsigned int* n_chunks, /* [out] number of chunks returned */ + chunk_read_req_t** chunks, /* [out] chunk array */ + int* extent_covered); /* [out] set=1 if extent fully covered */ /* dump method for debugging extent trees */ static inline -void extent_tree_dump(struct extent_tree* extent_tree) +void extent_tree_dump(struct extent_tree* tree) { - if (NULL == extent_tree) { + if (NULL == tree) { return; } - extent_tree_rdlock(extent_tree); + extent_tree_rdlock(tree); struct extent_tree_node* node = NULL; - while ((node = extent_tree_iter(extent_tree, node))) { + while ((node = extent_tree_iter(tree, node))) { LOGDBG("[%lu-%lu] @ %d(%d:%d) log offset %lu", - node->start, node->end, node->svr_rank, - node->app_id, node->cli_id, node->pos); + node->extent.start, node->extent.end, node->extent.svr_rank, + node->extent.app_id, node->extent.cli_id, node->extent.log_pos); } - extent_tree_unlock(extent_tree); + extent_tree_unlock(tree); } #endif /* __EXTENT_TREE_H__ */ diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 2f2eb218f..23d9abff3 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -28,8 +28,8 @@ ServerRpcContext_t* unifyfsd_rpc_context; bool margo_use_tcp = true; bool margo_lazy_connect; // = false -int margo_client_server_pool_sz = 4; -int margo_server_server_pool_sz = 4; +int margo_client_server_pool_sz = UNIFYFS_MARGO_POOL_SZ; +int margo_server_server_pool_sz = UNIFYFS_MARGO_POOL_SZ; int margo_use_progress_thread = 1; // records pmi rank, server address string, and server address @@ -655,6 +655,17 @@ static hg_handle_t create_client_handle(hg_id_t id, return handle; } +static int forward_to_client(hg_handle_t hdl, void* input_ptr) +{ + double timeout_msec = UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; + 
hg_return_t hret = margo_forward_timed(hdl, input_ptr, timeout_msec); + if (hret != HG_SUCCESS) { + LOGERR("margo_forward_timed() failed - %s", HG_Error_to_string(hret)); + return UNIFYFS_ERROR_MARGO; + } + return UNIFYFS_SUCCESS; +} + /* invokes the heartbeat rpc function */ int invoke_client_heartbeat_rpc(int app_id, int client_id) @@ -678,12 +689,11 @@ int invoke_client_heartbeat_rpc(int app_id, /* call rpc function */ LOGDBG("invoking the heartbeat rpc function in client[%d:%d]", app_id, client_id); - double timeout_msec = 500; /* half a second */ - hret = margo_forward_timed(handle, &in, timeout_msec); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward_timed() failed"); + int rc = forward_to_client(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of heartbeat rpc to client failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ @@ -747,11 +757,11 @@ int invoke_client_mread_req_data_rpc(int app_id, /* call rpc function */ LOGDBG("invoking the mread[%d] req data (index=%d) rpc function in " "client[%d:%d]", mread_id, read_index, app_id, client_id); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_client(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of mread-req-data rpc to client failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ @@ -806,11 +816,11 @@ int invoke_client_mread_req_complete_rpc(int app_id, /* call rpc function */ LOGDBG("invoking the mread[%d] complete rpc function in client[%d:%d]", mread_id, app_id, client_id); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_client(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of mread-complete rpc to client failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ @@ -859,11 +869,11 @@ int invoke_client_transfer_complete_rpc(int app_id, /* call rpc function */ LOGDBG("invoking the transfer[%d] complete rpc function in client[%d:%d]", transfer_id, app_id, client_id); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); + int rc = forward_to_client(handle, &in); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of transfer-complete rpc to client failed"); margo_destroy(handle); - return UNIFYFS_ERROR_MARGO; + return rc; } /* decode response */ diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 11a577b9a..ecff8bafb 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -68,6 +68,8 @@ extern ServerRpcContext_t* unifyfsd_rpc_context; extern bool margo_use_tcp; extern bool margo_lazy_connect; +extern int margo_client_server_pool_sz; +extern int margo_server_server_pool_sz; int margo_server_rpc_init(void); int margo_server_rpc_finalize(void); diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c index e74c1591d..bbd555112 100644 --- a/server/src/unifyfs_fops_rpc.c +++ b/server/src/unifyfs_fops_rpc.c @@ -86,22 +86,21 @@ int rpc_fsync(unifyfs_fops_ctx_t* ctx, /* the sync rpc now contains extents from a single file/gfid */ assert(gfid == index_entry[0].gfid); - struct extent_tree_node* extents = calloc(num_extents, sizeof(*extents)); + extent_metadata* extents = calloc(num_extents, sizeof(*extents)); if (NULL == extents) { LOGERR("failed to allocate memory for local_extents"); return ENOMEM; } for (i = 0; i < 
num_extents; i++) { - struct extent_tree_node* extent = extents + i; unifyfs_index_t* meta = index_entry + i; - - extent->start = meta->file_pos; - extent->end = (meta->file_pos + meta->length) - 1; + extent_metadata* extent = extents + i; + extent->start = meta->file_pos; + extent->end = (meta->file_pos + meta->length) - 1; extent->svr_rank = glb_pmi_rank; - extent->app_id = ctx->app_id; - extent->cli_id = ctx->client_id; - extent->pos = meta->log_pos; + extent->app_id = ctx->app_id; + extent->cli_id = ctx->client_id; + extent->log_pos = meta->log_pos; } /* update local inode state first */ @@ -244,7 +243,7 @@ int create_remote_read_requests(unsigned int n_chunks, static int submit_read_request(unifyfs_fops_ctx_t* ctx, unsigned int count, - unifyfs_inode_extent_t* extents) + unifyfs_extent_t* extents) { if ((count == 0) || (NULL == extents)) { return EINVAL; @@ -271,7 +270,7 @@ int submit_read_request(unifyfs_fops_ctx_t* ctx, int ret = UNIFYFS_SUCCESS; unsigned int extent_ndx = 0; for ( ; extent_ndx < count; extent_ndx++) { - unifyfs_inode_extent_t* ext = extents + extent_ndx; + unifyfs_extent_t* ext = extents + extent_ndx; unsigned int n_chunks = 0; chunk_read_req_t* chunks = NULL; int rc = unifyfs_invoke_find_extents_rpc(ext->gfid, 1, ext, @@ -296,14 +295,14 @@ int submit_read_request(unifyfs_fops_ctx_t* ctx, /* fill the information of server_read_req_t and submit */ server_read_req_t rdreq = { 0, }; - rdreq.app_id = app_id; - rdreq.client_id = client_id; - rdreq.client_mread = client_mread; - rdreq.client_read_ndx = extent_ndx; - rdreq.chunks = chunks; + rdreq.app_id = app_id; + rdreq.client_id = client_id; + rdreq.client_mread = client_mread; + rdreq.client_read_ndx = extent_ndx; + rdreq.chunks = chunks; rdreq.num_server_reads = (int) n_remote_reads; - rdreq.remote_reads = remote_reads; - rdreq.extent = *ext; + rdreq.remote_reads = remote_reads; + rdreq.extent = *ext; ret = rm_submit_read_request(&rdreq); } else { LOGDBG("extent(gfid=%d, offset=%lu, len=%lu) has no data", @@ -323,7 +322,7 @@ int rpc_read(unifyfs_fops_ctx_t* ctx, off_t offset, size_t length) { - unifyfs_inode_extent_t extent = { 0 }; + unifyfs_extent_t extent = { 0 }; extent.gfid = gfid; extent.offset = (unsigned long) offset; extent.length = (unsigned long) length; @@ -336,30 +335,9 @@ int rpc_mread(unifyfs_fops_ctx_t* ctx, size_t n_req, void* read_reqs) { - int ret = UNIFYFS_SUCCESS; - unsigned int i = 0; unsigned int count = (unsigned int) n_req; - unifyfs_inode_extent_t* extents = NULL; - unifyfs_extent_t* reqs = (unifyfs_extent_t*) read_reqs; - - extents = calloc(n_req, sizeof(*extents)); - if (NULL == extents) { - LOGERR("failed to allocate the chunk request"); - return ENOMEM; - } - - for (i = 0; i < count; i++) { - unifyfs_inode_extent_t* ext = extents + i; - unifyfs_extent_t* req = reqs + i; - ext->gfid = req->gfid; - ext->offset = (unsigned long) req->offset; - ext->length = (unsigned long) req->length; - } - - ret = submit_read_request(ctx, count, extents); - - free(extents); - return ret; + unifyfs_extent_t* extents = (unifyfs_extent_t*) read_reqs; + return submit_read_request(ctx, count, extents); } static struct unifyfs_fops _fops_rpc = { diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index d2f081422..997ceade4 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -19,7 +19,6 @@ # define UNIFYFS_BCAST_K_ARY 2 #endif - /* helper method to initialize collective request rpc handle for child peer */ static int get_child_request_handle(hg_id_t 
request_hgid, int peer_rank, @@ -37,8 +36,8 @@ static int get_child_request_handle(hg_id_t request_hgid, hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, request_hgid, chdl); if (hret != HG_SUCCESS) { - LOGERR("failed to get handle for child request to server %d", - peer_rank); + LOGERR("failed to get handle for child request to server %d - %s", + peer_rank, HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } } @@ -54,27 +53,166 @@ static int forward_child_request(void* input_ptr, int ret = UNIFYFS_SUCCESS; /* call rpc function */ - hg_return_t hret = margo_iforward(chdl, input_ptr, creq); + double timeout_ms = UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC; + hg_return_t hret = margo_iforward_timed(chdl, input_ptr, timeout_ms, creq); if (hret != HG_SUCCESS) { - LOGERR("failed to forward request(%p)", creq); + LOGERR("failed to forward request(%p) - %s", creq, + HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } return ret; } -/* helper method to wait for collective rpc child request completion */ -static int wait_for_child_request(margo_request* creq) +static int get_child_response(coll_request* coll_req, + hg_handle_t chdl) { int ret = UNIFYFS_SUCCESS; + void* out = calloc(1, coll_req->output_sz); + if (NULL == out) { + ret = ENOMEM; + } else { + hg_return_t hret = margo_get_output(chdl, out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* update collective return value using child response */ + int child_ret = UNIFYFS_SUCCESS; + void* output = coll_req->output; - /* call rpc function */ - hg_return_t hret = margo_wait(*creq); - if (hret != HG_SUCCESS) { - LOGERR("wait on request(%p) failed", creq); - ret = UNIFYFS_ERROR_MARGO; + switch (coll_req->req_type) { + case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: { + extent_bcast_out_t* cebo = (extent_bcast_out_t*) out; + extent_bcast_out_t* ebo = (extent_bcast_out_t*) output; + child_ret = cebo->ret; + if ((NULL != ebo) && (child_ret != UNIFYFS_SUCCESS)) { + ebo->ret = child_ret; + } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_FILEATTR: { + fileattr_bcast_out_t* cfbo = (fileattr_bcast_out_t*) out; + fileattr_bcast_out_t* fbo = (fileattr_bcast_out_t*) output; + child_ret = cfbo->ret; + if ((NULL != fbo) && (child_ret != UNIFYFS_SUCCESS)) { + fbo->ret = child_ret; + } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: { + laminate_bcast_out_t* clbo = (laminate_bcast_out_t*) out; + laminate_bcast_out_t* lbo = (laminate_bcast_out_t*) output; + child_ret = clbo->ret; + if ((NULL != lbo) && (child_ret != UNIFYFS_SUCCESS)) { + lbo->ret = child_ret; + } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_TRANSFER: { + transfer_bcast_out_t* ctbo = (transfer_bcast_out_t*) out; + transfer_bcast_out_t* tbo = (transfer_bcast_out_t*) output; + child_ret = ctbo->ret; + if ((NULL != tbo) && (child_ret != UNIFYFS_SUCCESS)) { + tbo->ret = child_ret; + } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { + truncate_bcast_out_t* ctbo = (truncate_bcast_out_t*) out; + truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; + child_ret = ctbo->ret; + if ((NULL != tbo) && (child_ret != UNIFYFS_SUCCESS)) { + tbo->ret = child_ret; + } + break; + } + case UNIFYFS_SERVER_BCAST_RPC_UNLINK: { + unlink_bcast_out_t* cubo = (unlink_bcast_out_t*) out; + unlink_bcast_out_t* ubo = (unlink_bcast_out_t*) output; + child_ret = cubo->ret; + if ((NULL != ubo) && (child_ret != UNIFYFS_SUCCESS)) { + ubo->ret = child_ret; + } + break; + } + default: + child_ret = 
UNIFYFS_FAILURE; + LOGERR("invalid collective request type %d", + coll_req->req_type); + break; + } + + ret = child_ret; + + margo_free_output(chdl, out); + } + } + + return ret; +} + +static int wait_for_all_child_requests(coll_request* coll_req, + int n_children) +{ + if (NULL == coll_req) { + return EINVAL; } + if (n_children == 0) { + return UNIFYFS_SUCCESS; + } else if (NULL == coll_req->child_reqs) { + LOGERR("collective(%p) has %d children, but NULL child_reqs array", + coll_req, n_children); + return EINVAL; + } + + int ret = UNIFYFS_SUCCESS; + int n_complete = 0; + + /* use margo_wait_any() until all requests completed/errored */ + do { + size_t complete_ndx; + hg_return_t hret = margo_wait_any((size_t)n_children, + coll_req->child_reqs, + &complete_ndx); + if (HG_SUCCESS == hret) { + n_complete++; + hg_handle_t* chdl = coll_req->child_hdls + complete_ndx; + margo_request* creq = coll_req->child_reqs + complete_ndx; + + /* get the output of the rpc */ + int child_ret = get_child_response(coll_req, *chdl); + LOGDBG("BCAST_RPC: collective(%p) child[%zu] resp=%d", + coll_req, complete_ndx, child_ret); + if (child_ret != UNIFYFS_SUCCESS) { + ret = child_ret; + } + + /* set request to MARGO_REQUEST_NULL so that the next call to + * margo_wait_any() will ignore it */ + *creq = MARGO_REQUEST_NULL; + + /* release the handle for the completed request */ + margo_destroy(*chdl); + *chdl = HG_HANDLE_NULL; + } else { + LOGERR("margo_wait_any() failed with error code=%s", + HG_Error_to_string(hret)); + ret = UNIFYFS_ERROR_MARGO; + + for (int i = 0; i < n_children; i++) { + hg_handle_t* chdl = coll_req->child_hdls + i; + if (HG_HANDLE_NULL != *chdl) { + margo_destroy(*chdl); + *chdl = HG_HANDLE_NULL; + } + } + + break; /* out of do/while loop */ + } + } while (n_complete < n_children); + return ret; } @@ -275,140 +413,25 @@ void collective_set_local_retval(coll_request* coll_req, int val) } } -static int coll_get_child_response(coll_request* coll_req, - hg_handle_t chdl) -{ - int ret = UNIFYFS_SUCCESS; - void* out = calloc(1, coll_req->output_sz); - if (NULL == out) { - ret = ENOMEM; - } else { - hg_return_t hret = margo_get_output(chdl, out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; - } else { - /* update collective return value using child response */ - int child_ret = UNIFYFS_SUCCESS; - void* output = coll_req->output; - - switch (coll_req->req_type) { - case UNIFYFS_SERVER_BCAST_RPC_EXTENTS: { - extent_bcast_out_t* cebo = (extent_bcast_out_t*) out; - extent_bcast_out_t* ebo = (extent_bcast_out_t*) output; - child_ret = cebo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - ebo->ret = child_ret; - } - break; - } - case UNIFYFS_SERVER_BCAST_RPC_FILEATTR: { - fileattr_bcast_out_t* cfbo = (fileattr_bcast_out_t*) out; - fileattr_bcast_out_t* fbo = (fileattr_bcast_out_t*) output; - child_ret = cfbo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - fbo->ret = child_ret; - } - break; - } - case UNIFYFS_SERVER_BCAST_RPC_LAMINATE: { - laminate_bcast_out_t* clbo = (laminate_bcast_out_t*) out; - laminate_bcast_out_t* lbo = (laminate_bcast_out_t*) output; - child_ret = clbo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - lbo->ret = child_ret; - } - break; - } - case UNIFYFS_SERVER_BCAST_RPC_TRANSFER: { - transfer_bcast_out_t* ctbo = (transfer_bcast_out_t*) out; - transfer_bcast_out_t* tbo = (transfer_bcast_out_t*) output; - child_ret = ctbo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - tbo->ret = child_ret; - } - break; - } - case 
UNIFYFS_SERVER_BCAST_RPC_TRUNCATE: { - truncate_bcast_out_t* ctbo = (truncate_bcast_out_t*) out; - truncate_bcast_out_t* tbo = (truncate_bcast_out_t*) output; - child_ret = ctbo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - tbo->ret = child_ret; - } - break; - } - case UNIFYFS_SERVER_BCAST_RPC_UNLINK: { - unlink_bcast_out_t* cubo = (unlink_bcast_out_t*) out; - unlink_bcast_out_t* ubo = (unlink_bcast_out_t*) output; - child_ret = cubo->ret; - if (child_ret != UNIFYFS_SUCCESS) { - ubo->ret = child_ret; - } - break; - } - default: - child_ret = UNIFYFS_FAILURE; - LOGERR("invalid collective request type %d", - coll_req->req_type); - break; - } - - ret = child_ret; - - margo_free_output(chdl, out); - } - } - - return ret; -} - /* Forward the collective request to any children */ static int collective_finish(coll_request* coll_req) { int ret = UNIFYFS_SUCCESS; - /* get info for tree */ - int child_count = coll_req->tree.child_count; - LOGDBG("BCAST_RPC: collective(%p) finish", coll_req); - if (child_count) { - /* wait for child requests to finish */ - int i, rc; - if (NULL != coll_req->child_reqs) { - margo_request* creq; - hg_handle_t* chdl; - /* TODO: use margo_wait_any() instead of our own loop */ - for (i = 0; i < child_count; i++) { - chdl = coll_req->child_hdls + i; - creq = coll_req->child_reqs + i; - rc = wait_for_child_request(creq); - if (rc == UNIFYFS_SUCCESS) { - /* get the output of the rpc */ - int child_ret = coll_get_child_response(coll_req, *chdl); - LOGDBG("BCAST_RPC: collective(%p) child[%d] resp=%d", - coll_req, i, child_ret); - if (child_ret != UNIFYFS_SUCCESS) { - ret = child_ret; - } - } else { - ret = rc; - } - margo_destroy(*chdl); - } - } else { - LOGERR("child count is %d, but NULL child reqs array", - child_count); - ret = UNIFYFS_FAILURE; - } + /* wait for responses from children */ + int child_count = coll_req->tree.child_count; + int rc = wait_for_all_child_requests(coll_req, child_count); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; } if (NULL != coll_req->output) { /* send output back to caller */ hg_return_t hret = margo_respond(coll_req->resp_hdl, coll_req->output); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } LOGDBG("BCAST_RPC: collective(%p, op=%d) responded", @@ -442,15 +465,18 @@ int invoke_bcast_progress_rpc(coll_request* coll_req) hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, hgid, &handle); if (hret != HG_SUCCESS) { - LOGERR("failed to get handle for bcast progress"); + LOGERR("failed to get handle for bcast progress - %s", + HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { - /* call rpc function */ + /* call local rpc function, which allows progress to be handled + * by a ULT */ bcast_progress_in_t in; in.coll_req = (hg_ptr_t) coll_req; hret = margo_forward(handle, &in); if (hret != HG_SUCCESS) { - LOGERR("failed to forward bcast progress for coll(%p)", coll_req); + LOGERR("failed to forward bcast progress for coll(%p) - %s", + HG_Error_to_string(hret), coll_req); ret = UNIFYFS_ERROR_MARGO; } } @@ -467,7 +493,7 @@ static void bcast_progress_rpc(hg_handle_t handle) bcast_progress_in_t in; hg_return_t hret = margo_get_input(handle, &in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* call collective_finish() to progress bcast operation */ @@ -485,7 +511,7 @@ static void bcast_progress_rpc(hg_handle_t handle) 
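/*
 * Illustrative sketch of the completion pattern used by the new
 * wait_for_all_child_requests() above: a set of outstanding margo requests
 * is retired one at a time with margo_wait_any(), and each finished slot is
 * set to MARGO_REQUEST_NULL so later calls skip it.  The helper name
 * drain_requests() is hypothetical and error handling is reduced to the
 * minimum; this is a sketch, not the UnifyFS implementation.
 */
#include <margo.h>

static int drain_requests(size_t n, margo_request* reqs, hg_handle_t* hdls)
{
    size_t remaining = n;
    while (remaining > 0) {
        size_t idx;
        hg_return_t hret = margo_wait_any(n, reqs, &idx);
        if (hret != HG_SUCCESS) {
            /* a failed wait aborts the drain; caller cleans up the rest */
            return -1;
        }

        /* mark the completed slot so the next margo_wait_any() ignores it */
        reqs[idx] = MARGO_REQUEST_NULL;

        /* release the handle that belonged to the completed request */
        margo_destroy(hdls[idx]);
        hdls[idx] = HG_HANDLE_NULL;

        remaining--;
    }
    return 0;
}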
out.ret = ret; hret = margo_respond(handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } /* free margo resources */ @@ -517,11 +543,11 @@ static void extent_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { size_t num_extents = (size_t) in->num_extents; - size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); + size_t bulk_sz = num_extents * sizeof(struct extent_metadata); hg_bulk_t local_bulk = HG_BULK_NULL; void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, bulk_sz, &local_bulk); @@ -564,7 +590,7 @@ static void extent_bcast_rpc(hg_handle_t handle) ebo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &ebo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { @@ -585,7 +611,7 @@ int unifyfs_invoke_broadcast_extents_rpc(int gfid) LOGDBG("BCAST_RPC: starting extents for gfid=%d", gfid); size_t n_extents; - struct extent_tree_node* extents; + struct extent_metadata* extents; ret = unifyfs_inode_get_extents(gfid, &n_extents, &extents); if (ret != UNIFYFS_SUCCESS) { LOGERR("failed to get extents for gfid=%d", gfid); @@ -606,7 +632,7 @@ int unifyfs_invoke_broadcast_extents_rpc(int gfid) &buf, &buf_size, HG_BULK_READ_ONLY, &extents_bulk); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { coll_request* coll = NULL; @@ -668,11 +694,11 @@ static void laminate_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { size_t n_extents = (size_t) in->num_extents; - size_t bulk_sz = n_extents * sizeof(struct extent_tree_node); + size_t bulk_sz = n_extents * sizeof(struct extent_metadata); hg_bulk_t local_bulk = HG_BULK_NULL; void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, bulk_sz, &local_bulk); @@ -715,7 +741,7 @@ static void laminate_bcast_rpc(hg_handle_t handle) lbo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &lbo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { @@ -750,7 +776,7 @@ int unifyfs_invoke_broadcast_laminate(int gfid) LOGDBG("BCAST_RPC: starting laminate for gfid=%d", gfid); size_t n_extents; - struct extent_tree_node* extents; + struct extent_metadata* extents; ret = unifyfs_inode_get_extents(gfid, &n_extents, &extents); if (ret != UNIFYFS_SUCCESS) { LOGERR("failed to get extents for gfid=%d", gfid); @@ -767,7 +793,8 @@ int unifyfs_invoke_broadcast_laminate(int gfid) &buf, &buf_size, HG_BULK_READ_ONLY, &extents_bulk); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", + HG_Error_to_string(hret)); free(buf); return UNIFYFS_ERROR_MARGO; } @@ -833,7 +860,7 @@ static void transfer_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() 
failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.transfer_bcast_id; @@ -867,7 +894,7 @@ static void transfer_bcast_rpc(hg_handle_t handle) tbo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &tbo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { @@ -967,7 +994,7 @@ static void truncate_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.truncate_bcast_id; @@ -1001,7 +1028,7 @@ static void truncate_bcast_rpc(hg_handle_t handle) tbo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &tbo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { @@ -1073,7 +1100,7 @@ static void fileattr_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.fileattr_bcast_id; @@ -1107,7 +1134,7 @@ static void fileattr_bcast_rpc(hg_handle_t handle) fbo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &fbo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { @@ -1180,7 +1207,7 @@ static void unlink_bcast_rpc(hg_handle_t handle) /* get input params */ hg_return_t hret = margo_get_input(handle, in); if (hret != HG_SUCCESS) { - LOGERR("margo_get_input() failed"); + LOGERR("margo_get_input() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.unlink_bcast_id; @@ -1214,7 +1241,7 @@ static void unlink_bcast_rpc(hg_handle_t handle) ubo.ret = (int32_t)ret; hg_return_t hret = margo_respond(handle, &ubo); if (hret != HG_SUCCESS) { - LOGERR("margo_respond() failed"); + LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } if (NULL != coll) { diff --git a/server/src/unifyfs_inode.c b/server/src/unifyfs_inode.c index 0596a0c55..12980a3be 100644 --- a/server/src/unifyfs_inode.c +++ b/server/src/unifyfs_inode.c @@ -146,19 +146,19 @@ int unifyfs_inode_destroy(struct unifyfs_inode* ino) struct extent_tree* tree = ino->extents; struct extent_tree_node* curr = NULL; while (NULL != (curr = extent_tree_iter(tree, curr))) { - if (curr->svr_rank == glb_pmi_rank) { + if (curr->extent.svr_rank == glb_pmi_rank) { /* lookup client's logio context and release * allocation for this extent */ - int app_id = curr->app_id; - int client_id = curr->cli_id; + int app_id = curr->extent.app_id; + int client_id = curr->extent.cli_id; app_client* client = get_app_client(app_id, client_id); if ((NULL == client) || (NULL == client->state.logio_ctx)) { continue; } logio_context* logio = client->state.logio_ctx; - size_t nbytes = (1 + (curr->end - curr->start)); - off_t log_off = curr->pos; + size_t nbytes = extent_length(&(curr->extent)); + off_t log_off = curr->extent.log_pos; int rc = unifyfs_logio_free(logio, log_off, 
nbytes); if (UNIFYFS_SUCCESS != rc) { LOGERR("failed to free logio allocation for " @@ -339,8 +339,9 @@ int unifyfs_inode_truncate(int gfid, unsigned long size) return ret; } -int unifyfs_inode_add_extents(int gfid, int num_extents, - struct extent_tree_node* nodes) +int unifyfs_inode_add_extents(int gfid, + int num_extents, + extent_metadata* extents) { int ret = UNIFYFS_SUCCESS; int i = 0; @@ -373,16 +374,8 @@ int unifyfs_inode_add_extents(int gfid, int num_extents, } for (i = 0; i < num_extents; i++) { - struct extent_tree_node* current = &nodes[i]; - - /* debug output becomes too noisy with this: - * LOGDBG("extent[%4d]: [%lu, %lu] @ server[%d] log(%d:%d:%lu)", - * i, current->start, current->end, current->svr_rank, - * current->app_id, current->cli_id, current->pos); - */ - ret = extent_tree_add(tree, current->start, current->end, - current->svr_rank, current->app_id, - current->cli_id, current->pos); + extent_metadata* current = extents + i; + ret = extent_tree_add(tree, current); if (ret) { LOGERR("failed to add extent [%lu, %lu] to gfid=%d", current->start, current->end, gfid); @@ -464,41 +457,41 @@ int unifyfs_inode_laminate(int gfid) return ret; } -int unifyfs_inode_get_extents(int gfid, size_t* n, - struct extent_tree_node** nodes) +int unifyfs_inode_get_extents(int gfid, + size_t* n, + extent_metadata** extents) { - int ret = UNIFYFS_SUCCESS; - struct unifyfs_inode* ino = NULL; - - if (!n || !nodes) { + if ((NULL == n) || (NULL == extents)) { return EINVAL; } + int ret = UNIFYFS_SUCCESS; + struct unifyfs_inode* ino = NULL; + unifyfs_inode_tree_rdlock(global_inode_tree); { ino = unifyfs_inode_tree_search(global_inode_tree, gfid); - if (!ino) { + if (NULL == ino) { ret = ENOENT; } else { unifyfs_inode_rdlock(ino); { - int i = 0; struct extent_tree* tree = ino->extents; - size_t n_nodes = tree->count; - struct extent_tree_node* _nodes = calloc(n_nodes, - sizeof(*_nodes)); - struct extent_tree_node* curr = NULL; - - if (!_nodes) { + size_t n_extents = tree->count; + extent_metadata* _extents = calloc(n_extents, + sizeof(*_extents)); + if (NULL == _extents) { ret = ENOMEM; } else { - while (NULL != (curr = extent_tree_iter(tree, curr))) { - _nodes[i] = *curr; + int i = 0; + struct extent_tree_node* curr = NULL; + while ((curr = extent_tree_iter(tree, curr)) != NULL) { + _extents[i] = curr->extent; i++; } - *n = n_nodes; - *nodes = _nodes; + *n = n_extents; + *extents = _extents; } } unifyfs_inode_unlock(ino); @@ -509,7 +502,7 @@ int unifyfs_inode_get_extents(int gfid, size_t* n, return ret; } -int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, +int unifyfs_inode_get_extent_chunks(unifyfs_extent_t* extent, unsigned int* n_chunks, chunk_read_req_t** chunks, int* full_coverage) @@ -531,7 +524,7 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, { if (NULL != ino->extents) { unsigned long offset = extent->offset; - unsigned long len = extent->length; + unsigned long len = extent->length; ret = extent_tree_get_chunk_list(ino->extents, offset, len, n_chunks, chunks, &covered); @@ -547,7 +540,7 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, unifyfs_inode_tree_unlock(global_inode_tree); if (ret == UNIFYFS_SUCCESS) { - /* extent_tree_get_chunk_list does not populate the gfid field */ + /* extent_tree_get_chunk_list() does not populate the gfid field */ for (unsigned int i = 0; i < *n_chunks; i++) { (*chunks)[i].gfid = gfid; } @@ -582,7 +575,7 @@ int compare_chunk_read_reqs(const void* _c1, const void* _c2) int 
unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* n_locs, chunk_read_req_t** chunklocs, int* full_coverage) @@ -613,7 +606,7 @@ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, /* resolve chunks addresses for all requests from inode tree */ for (i = 0; i < n_extents; i++) { - unifyfs_inode_extent_t* current = &extents[i]; + unifyfs_extent_t* current = &extents[i]; LOGDBG("resolving extent request [gfid=%d, offset=%lu, length=%lu]", current->gfid, current->offset, current->length); diff --git a/server/src/unifyfs_inode.h b/server/src/unifyfs_inode.h index c52c0f0a8..dbae7661e 100644 --- a/server/src/unifyfs_inode.h +++ b/server/src/unifyfs_inode.h @@ -123,25 +123,28 @@ int unifyfs_inode_truncate(int gfid, unsigned long size); /** * @brief get the local extent array from the target inode * - * @param gfid the global file identifier - * @param n the number of extents, set by this function - * @param nodes the pointer to the array of extents, caller should free this + * @param gfid the global file identifier + * @param n pointer to size of the extents array + * @param extents pointer to extents array (caller should free) * * @return 0 on success, errno otherwise */ -int unifyfs_inode_get_extents(int gfid, size_t* n, - struct extent_tree_node** nodes); +int unifyfs_inode_get_extents(int gfid, + size_t* n, + extent_metadata** extents); /** * @brief add new extents to the inode * - * @param gfid the global file identifier - * @param n the number of new extents in @nodes - * @param nodes an array of extents to be added + * @param gfid the global file identifier + * @param num_extents the number of new extents in @nodes + * @param extents an array of extents to be added * * @return */ -int unifyfs_inode_add_extents(int gfid, int n, struct extent_tree_node* nodes); +int unifyfs_inode_add_extents(int gfid, + int num_extents, + extent_metadata* extents); /** * @brief get the maximum file size from the local extent tree of given file @@ -166,15 +169,15 @@ int unifyfs_inode_laminate(int gfid); /** * @brief Get chunks for given file extent * - * @param extent target file extent + * @param extent target file extent * - * @param[out] n_chunks number of output chunk locations - * @param[out] chunks array of output chunk locations + * @param[out] n_chunks number of output chunk locations + * @param[out] chunks array of output chunk locations * @param[out] full_coverage set to 1 if chunks fully cover extent * * @return UNIFYFS_SUCCESS, or error code */ -int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, +int unifyfs_inode_get_extent_chunks(unifyfs_extent_t* extent, unsigned int* n_chunks, chunk_read_req_t** chunks, int* full_coverage); @@ -192,7 +195,7 @@ int unifyfs_inode_get_extent_chunks(unifyfs_inode_extent_t* extent, * @return UNIFYFS_SUCCESS, or error code */ int unifyfs_inode_resolve_extent_chunks(unsigned int n_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* n_locs, chunk_read_req_t** chunklocs, int* full_coverage); diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index 1e3172706..95ff050cf 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -44,8 +44,8 @@ int get_p2p_request_handle(hg_id_t request_hgid, hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, req->peer, request_hgid, &(req->handle)); if (hret != HG_SUCCESS) { - LOGERR("failed to get handle for p2p request(%p) to server 
%d", - req, peer_rank); + LOGERR("failed to get handle for p2p request(%p) to server %d - %s", + req, peer_rank, HG_Error_to_string(hret)); rc = UNIFYFS_ERROR_MARGO; } @@ -59,10 +59,12 @@ int forward_p2p_request(void* input_ptr, int rc = UNIFYFS_SUCCESS; /* call rpc function */ - hg_return_t hret = margo_iforward(req->handle, input_ptr, - &(req->request)); + double timeout_ms = UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC; + hg_return_t hret = margo_iforward_timed(req->handle, input_ptr, + timeout_ms, &(req->request)); if (hret != HG_SUCCESS) { - LOGERR("failed to forward p2p request(%p)", req); + LOGERR("failed to forward p2p request(%p) - %s", + req, HG_Error_to_string(hret)); rc = UNIFYFS_ERROR_MARGO; } @@ -77,7 +79,8 @@ int wait_for_p2p_request(p2p_request* req) /* call rpc function */ hg_return_t hret = margo_wait(req->request); if (hret != HG_SUCCESS) { - LOGERR("wait on p2p request(%p) failed", req); + LOGERR("wait on p2p request(%p) failed - %s", + req, HG_Error_to_string(hret)); rc = UNIFYFS_ERROR_MARGO; } @@ -107,35 +110,26 @@ int invoke_chunk_read_request_rpc(int dst_srvr_rank, } int ret = UNIFYFS_SUCCESS; - hg_handle_t handle; chunk_read_request_in_t in; chunk_read_request_out_t out; hg_return_t hret; - hg_addr_t dst_srvr_addr; hg_size_t bulk_sz = (hg_size_t)num_chunks * sizeof(chunk_read_req_t); - assert(dst_srvr_rank < (int)glb_num_servers); - dst_srvr_addr = get_margo_server_address(dst_srvr_rank); - if (HG_ADDR_NULL == dst_srvr_addr) { - LOGERR("missing margo address for rank=%d", dst_srvr_rank); - return UNIFYFS_ERROR_MARGO; - } - - hret = margo_create(unifyfsd_rpc_context->svr_mid, dst_srvr_addr, - unifyfsd_rpc_context->rpcs.chunk_read_request_id, - &handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_create() failed"); - return UNIFYFS_ERROR_MARGO; + /* forward request to file owner */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.chunk_read_request_id; + int rc = get_p2p_request_handle(req_hgid, dst_srvr_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; } - /* fill in input struct */ - in.src_rank = (int32_t)glb_pmi_rank; - in.app_id = (int32_t)rdreq->app_id; - in.client_id = (int32_t)rdreq->client_id; - in.req_id = (int32_t)rdreq->req_ndx; - in.num_chks = (int32_t)num_chunks; - in.total_data_size = (hg_size_t)remote_reads->total_sz; + /* fill input struct */ + in.src_rank = (int32_t) glb_pmi_rank; + in.app_id = (int32_t) rdreq->app_id; + in.client_id = (int32_t) rdreq->client_id; + in.req_id = (int32_t) rdreq->req_ndx; + in.num_chks = (int32_t) num_chunks; + in.total_data_size = (hg_size_t) remote_reads->total_sz; in.bulk_size = bulk_sz; /* register request buffer for bulk remote access */ @@ -144,31 +138,41 @@ int invoke_chunk_read_request_rpc(int dst_srvr_rank, &data_buf, &bulk_sz, HG_BULK_READ_ONLY, &in.bulk_handle); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { LOGDBG("invoking the chunk-read-request rpc function"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); - ret = UNIFYFS_ERROR_MARGO; + rc = forward_p2p_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("forward of chunk-read request rpc to server[%d] failed", + dst_srvr_rank); + margo_bulk_free(in.bulk_handle); + margo_destroy(preq.handle); + return UNIFYFS_ERROR_MARGO; + } + + /* wait for request completion */ + rc = wait_for_p2p_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + ret = 
rc; } else { /* decode response */ - hret = margo_get_output(handle, &out); + hret = margo_get_output(preq.handle, &out); if (hret == HG_SUCCESS) { ret = (int)out.ret; - LOGDBG("Got request rpc response from %d - ret=%d", - dst_srvr_rank, ret); - margo_free_output(handle, &out); + LOGDBG("Got chunk-read response from server[%d] - ret=%d", + dst_srvr_rank, ret); + margo_free_output(preq.handle, &out); } else { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", + HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } } margo_bulk_free(in.bulk_handle); } - margo_destroy(handle); + margo_destroy(preq.handle); return ret; } @@ -249,84 +253,78 @@ DEFINE_MARGO_RPC_HANDLER(chunk_read_request_rpc) int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr) { /* assume we'll succeed */ - int rc = UNIFYFS_SUCCESS; + int ret = UNIFYFS_SUCCESS; /* rank of destination server */ int dst_rank = scr->rank; assert(dst_rank < (int)glb_num_servers); - /* get address of destinaton server */ - hg_addr_t dst_addr = get_margo_server_address(dst_rank); - if (HG_ADDR_NULL == dst_addr) { - LOGERR("missing margo address for rank=%d", dst_rank); - return UNIFYFS_ERROR_MARGO; - } - - /* pointer to struct containing rpc context info, - * shorter name for convience */ - ServerRpcContext_t* ctx = unifyfsd_rpc_context; - - /* get handle to read response rpc on destination server */ - hg_handle_t handle; - hg_id_t resp_id = ctx->rpcs.chunk_read_response_id; - hg_return_t hret = margo_create(ctx->svr_mid, dst_addr, - resp_id, &handle); - if (hret != HG_SUCCESS) { - LOGERR("margo_create() failed"); - return UNIFYFS_ERROR_MARGO; + /* forward response to requesting server */ + p2p_request preq; + hg_id_t req_hgid = unifyfsd_rpc_context->rpcs.chunk_read_response_id; + int rc = get_p2p_request_handle(req_hgid, dst_rank, &preq); + if (rc != UNIFYFS_SUCCESS) { + return rc; } /* get address and size of our response buffer */ - void* data_buf = (void*)scr->resp; + void* data_buf = (void*) scr->resp; hg_size_t bulk_sz = scr->total_sz; /* register our response buffer for bulk remote read access */ chunk_read_response_in_t in; - hret = margo_bulk_create(ctx->svr_mid, 1, &data_buf, &bulk_sz, - HG_BULK_READ_ONLY, &in.bulk_handle); + hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, + 1, &data_buf, &bulk_sz, + HG_BULK_READ_ONLY, &in.bulk_handle); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", HG_Error_to_string(hret)); + margo_destroy(preq.handle); return UNIFYFS_ERROR_MARGO; } - /* fill in input struct */ - in.src_rank = (int32_t)glb_pmi_rank; - in.app_id = (int32_t)scr->app_id; - in.client_id = (int32_t)scr->client_id; - in.req_id = (int32_t)scr->rdreq_id; - in.num_chks = (int32_t)scr->num_chunks; + /* fill input struct */ + in.src_rank = (int32_t) glb_pmi_rank; + in.app_id = (int32_t) scr->app_id; + in.client_id = (int32_t) scr->client_id; + in.req_id = (int32_t) scr->rdreq_id; + in.num_chks = (int32_t) scr->num_chunks; in.bulk_size = bulk_sz; /* call the read response rpc */ LOGDBG("invoking the chunk-read-response rpc function"); - hret = margo_forward(handle, &in); - if (hret != HG_SUCCESS) { - LOGERR("margo_forward() failed"); - rc = UNIFYFS_ERROR_MARGO; + rc = forward_p2p_request((void*)&in, &preq); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; } else { - /* rpc executed, now decode response */ - chunk_read_response_out_t out; - hret = margo_get_output(handle, &out); - if (hret == HG_SUCCESS) { - rc = (int)out.ret; - 
LOGDBG("chunk-read-response rpc to server[%d] - ret=%d", - dst_rank, rc); - margo_free_output(handle, &out); + rc = wait_for_p2p_request(&preq); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; } else { - LOGERR("margo_get_output() failed"); - rc = UNIFYFS_ERROR_MARGO; + /* rpc executed, now decode response */ + chunk_read_response_out_t out; + hret = margo_get_output(preq.handle, &out); + if (hret == HG_SUCCESS) { + ret = (int)out.ret; + LOGDBG("chunk-read-response rpc to server[%d] - ret=%d", + dst_rank, rc); + margo_free_output(preq.handle, &out); + } else { + LOGERR("margo_get_output() failed - %s", + HG_Error_to_string(hret)); + ret = UNIFYFS_ERROR_MARGO; + } } } /* free resources allocated for executing margo rpc */ margo_bulk_free(in.bulk_handle); - margo_destroy(handle); + margo_destroy(preq.handle); /* free response data buffer */ free(data_buf); scr->resp = NULL; - return rc; + return ret; } /* handler for server-server chunk read response */ @@ -413,7 +411,7 @@ DEFINE_MARGO_RPC_HANDLER(chunk_read_response_rpc) /* Add extents to target file */ int unifyfs_invoke_add_extents_rpc(int gfid, unsigned int num_extents, - struct extent_tree_node* extents) + extent_metadata* extents) { int owner_rank = hash_gfid_to_server(gfid); if (owner_rank == glb_pmi_rank) { @@ -432,45 +430,50 @@ int unifyfs_invoke_add_extents_rpc(int gfid, /* create a margo bulk transfer handle for extents array */ hg_bulk_t bulk_handle; void* buf = (void*) extents; - size_t buf_sz = (size_t)num_extents * sizeof(struct extent_tree_node); + size_t buf_sz = (size_t)num_extents * sizeof(extent_metadata); hg_return_t hret = margo_bulk_create(unifyfsd_rpc_context->svr_mid, 1, &buf, &buf_sz, HG_BULK_READ_ONLY, &bulk_handle); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", HG_Error_to_string(hret)); + margo_destroy(preq.handle); return UNIFYFS_ERROR_MARGO; } /* fill rpc input struct and forward request */ add_extents_in_t in; - in.src_rank = (int32_t) glb_pmi_rank; - in.gfid = (int32_t) gfid; + in.src_rank = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; in.num_extents = (int32_t) num_extents; - in.extents = bulk_handle; + in.extents = bulk_handle; + LOGDBG("forwarding add_extents(gfid=%d) to server[%d]", gfid, owner_rank); rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_bulk_free(bulk_handle); + margo_destroy(preq.handle); return rc; } - margo_bulk_free(bulk_handle); /* wait for request completion */ + int ret = UNIFYFS_SUCCESS; rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { - return rc; - } - - /* get the output of the rpc */ - int ret; - add_extents_out_t out; - hret = margo_get_output(preq.handle, &out); - if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); - ret = UNIFYFS_ERROR_MARGO; + ret = rc; } else { - /* set return value */ - ret = out.ret; - margo_free_output(preq.handle, &out); + /* get the output of the rpc */ + add_extents_out_t out; + hret = margo_get_output(preq.handle, &out); + if (hret != HG_SUCCESS) { + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); + ret = UNIFYFS_ERROR_MARGO; + } else { + /* set return value */ + ret = out.ret; + margo_free_output(preq.handle, &out); + } } + + margo_bulk_free(bulk_handle); margo_destroy(preq.handle); return ret; @@ -495,7 +498,7 @@ static void add_extents_rpc(hg_handle_t handle) ret = UNIFYFS_ERROR_MARGO; } else { size_t num_extents = (size_t) in->num_extents; - size_t bulk_sz = num_extents * sizeof(struct extent_tree_node); 
+ size_t bulk_sz = num_extents * sizeof(extent_metadata); /* allocate memory for extents */ void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, @@ -551,7 +554,7 @@ DEFINE_MARGO_RPC_HANDLER(add_extents_rpc) /* Lookup extent locations for target file */ int unifyfs_invoke_find_extents_rpc(int gfid, unsigned int num_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* num_chunks, chunk_read_req_t** chunks) { @@ -611,22 +614,24 @@ int unifyfs_invoke_find_extents_rpc(int gfid, /* create a margo bulk transfer handle for extents array */ hg_bulk_t bulk_req_handle; void* buf = (void*) extents; - size_t buf_sz = (size_t)num_extents * sizeof(unifyfs_inode_extent_t); + size_t buf_sz = (size_t)num_extents * sizeof(unifyfs_extent_t); hg_return_t hret = margo_bulk_create(mid, 1, &buf, &buf_sz, HG_BULK_READ_ONLY, &bulk_req_handle); if (hret != HG_SUCCESS) { - LOGERR("margo_bulk_create() failed"); + LOGERR("margo_bulk_create() failed - %s", HG_Error_to_string(hret)); + margo_destroy(preq.handle); return UNIFYFS_ERROR_MARGO; } /* fill rpc input struct and forward request */ find_extents_in_t in; - in.src_rank = (int32_t) glb_pmi_rank; - in.gfid = (int32_t) gfid; + in.src_rank = (int32_t) glb_pmi_rank; + in.gfid = (int32_t) gfid; in.num_extents = (int32_t) num_extents; - in.extents = bulk_req_handle; + in.extents = bulk_req_handle; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } margo_bulk_free(bulk_req_handle); @@ -634,6 +639,7 @@ int unifyfs_invoke_find_extents_rpc(int gfid, /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -641,7 +647,7 @@ int unifyfs_invoke_find_extents_rpc(int gfid, find_extents_out_t out; hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -652,8 +658,8 @@ int unifyfs_invoke_find_extents_rpc(int gfid, if (n_chks > 0) { /* get bulk buffer with chunk locations */ buf_sz = (size_t)n_chks * sizeof(chunk_read_req_t); - buf = pull_margo_bulk_buffer(preq.handle, out.locations, buf_sz, - NULL); + buf = pull_margo_bulk_buffer(preq.handle, out.locations, + buf_sz, NULL); if (NULL == buf) { LOGERR("failed to get bulk chunk locations"); ret = UNIFYFS_ERROR_MARGO; @@ -692,7 +698,7 @@ static void find_extents_rpc(hg_handle_t handle) ret = UNIFYFS_ERROR_MARGO; } else { size_t num_extents = (size_t) in->num_extents; - size_t bulk_sz = num_extents * sizeof(unifyfs_inode_extent_t); + size_t bulk_sz = num_extents * sizeof(unifyfs_extent_t); /* allocate memory for extents */ void* extents_buf = pull_margo_bulk_buffer(handle, in->extents, @@ -778,7 +784,7 @@ int unifyfs_invoke_metaget_rpc(int gfid, LOGINFO("using cached attributes for gfid=%d", gfid); return UNIFYFS_SUCCESS; } else { - LOGINFO("cached attributes have expired"); + LOGINFO("cached attributes for gfid=%d have expired", gfid); } } else if (rc == ENOENT) { /* metaget above failed with ENOENT, need to create inode */ @@ -795,15 +801,17 @@ int unifyfs_invoke_metaget_rpc(int gfid, /* fill rpc input struct and forward request */ metaget_in_t in; - in.gfid = (int32_t)gfid; + in.gfid = (int32_t) gfid; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = 
wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -812,7 +820,7 @@ int unifyfs_invoke_metaget_rpc(int gfid, metaget_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -928,12 +936,14 @@ int unifyfs_invoke_filesize_rpc(int gfid, in.gfid = (int32_t)gfid; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -942,7 +952,7 @@ int unifyfs_invoke_filesize_rpc(int gfid, filesize_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -1046,17 +1056,19 @@ int unifyfs_invoke_metaset_rpc(int gfid, /* fill rpc input struct and forward request */ metaset_in_t in; - in.gfid = (int32_t) gfid; + in.gfid = (int32_t) gfid; in.fileop = (int32_t) attr_op; - in.attr = *attrs; + in.attr = *attrs; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -1064,7 +1076,7 @@ int unifyfs_invoke_metaset_rpc(int gfid, metaset_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -1154,15 +1166,17 @@ int unifyfs_invoke_laminate_rpc(int gfid) /* fill rpc input struct and forward request */ laminate_in_t in; - in.gfid = (int32_t)gfid; + in.gfid = (int32_t) gfid; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -1170,7 +1184,7 @@ int unifyfs_invoke_laminate_rpc(int gfid) laminate_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -1273,12 +1287,14 @@ int unifyfs_invoke_transfer_rpc(int client_app, in.dst_file = (hg_const_string_t) dest_file; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -1287,7 +1303,7 @@ int unifyfs_invoke_transfer_rpc(int client_app, transfer_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -1376,16 +1392,18 @@ int unifyfs_invoke_truncate_rpc(int gfid, /* fill rpc input struct and forward 
request */ truncate_in_t in; - in.gfid = (int32_t) gfid; + in.gfid = (int32_t) gfid; in.filesize = (hg_size_t) filesize; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -1394,7 +1412,7 @@ int unifyfs_invoke_truncate_rpc(int gfid, truncate_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ @@ -1479,12 +1497,14 @@ int unifyfs_invoke_server_pid_rpc(void) in.pid = server_pid; rc = forward_p2p_request((void*)&in, &preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } /* wait for request completion */ rc = wait_for_p2p_request(&preq); if (rc != UNIFYFS_SUCCESS) { + margo_destroy(preq.handle); return rc; } @@ -1493,7 +1513,7 @@ int unifyfs_invoke_server_pid_rpc(void) server_pid_out_t out; hg_return_t hret = margo_get_output(preq.handle, &out); if (hret != HG_SUCCESS) { - LOGERR("margo_get_output() failed"); + LOGERR("margo_get_output() failed - %s", HG_Error_to_string(hret)); ret = UNIFYFS_ERROR_MARGO; } else { /* set return value */ diff --git a/server/src/unifyfs_p2p_rpc.h b/server/src/unifyfs_p2p_rpc.h index d8e73d8ec..50e6a41c0 100644 --- a/server/src/unifyfs_p2p_rpc.h +++ b/server/src/unifyfs_p2p_rpc.h @@ -79,7 +79,7 @@ int invoke_chunk_read_response_rpc(server_chunk_reads_t* scr); */ int unifyfs_invoke_add_extents_rpc(int gfid, unsigned int num_extents, - struct extent_tree_node* extents); + extent_metadata* extents); /** * @brief Find location of extents for target file @@ -95,7 +95,7 @@ int unifyfs_invoke_add_extents_rpc(int gfid, */ int unifyfs_invoke_find_extents_rpc(int gfid, unsigned int num_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* num_chunks, chunk_read_req_t** chunks); diff --git a/server/src/unifyfs_request_manager.h b/server/src/unifyfs_request_manager.h index 0bfba4079..09e37fba9 100644 --- a/server/src/unifyfs_request_manager.h +++ b/server/src/unifyfs_request_manager.h @@ -61,7 +61,7 @@ typedef struct { int num_server_reads; /* size of remote_reads array */ chunk_read_req_t* chunks; /* array of chunk-reads */ server_chunk_reads_t* remote_reads; /* per-server remote reads array */ - unifyfs_inode_extent_t extent; /* the requested extent */ + unifyfs_extent_t extent; /* the requested extent */ } server_read_req_t; /* Request manager state structure - created by main thread for each request diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index 3d29630e1..eb3c55bb3 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -282,6 +282,8 @@ int main(int argc, char* argv[]) { int rc; int kv_rank, kv_nranks; + long l; + bool b; bool daemon = true; struct sigaction sa; char dbg_fname[UNIFYFS_MAX_FILENAME] = {0}; @@ -304,7 +306,6 @@ int main(int argc, char* argv[]) server_pid = getpid(); if (server_cfg.log_verbosity != NULL) { - long l; rc = configurator_int_val(server_cfg.log_verbosity, &l); if (0 == rc) { unifyfs_set_log_level((unifyfs_log_level_t)l); @@ -337,7 +338,6 @@ int main(int argc, char* argv[]) // update clients_per_app based on configuration if (server_cfg.server_max_app_clients != NULL) { - long l; rc = 
configurator_int_val(server_cfg.server_max_app_clients, &l); if (0 == rc) { clients_per_app = l; @@ -414,14 +414,31 @@ int main(int argc, char* argv[]) glb_pmi_size = kv_nranks; } - LOGDBG("initializing rpc service"); - rc = configurator_bool_val(server_cfg.margo_lazy_connect, - &margo_lazy_connect); - rc = configurator_bool_val(server_cfg.margo_tcp, - &margo_use_tcp); + LOGDBG("initializing RPC service"); + + rc = configurator_int_val(server_cfg.margo_client_pool_size, &l); + if (0 == rc) { + margo_client_server_pool_sz = l; + } + + rc = configurator_int_val(server_cfg.margo_server_pool_size, &l); + if (0 == rc) { + margo_server_server_pool_sz = l; + } + + rc = configurator_bool_val(server_cfg.margo_lazy_connect, &b); + if (0 == rc) { + margo_lazy_connect = b; + } + + rc = configurator_bool_val(server_cfg.margo_tcp, &b); + if (0 == rc) { + margo_use_tcp = b; + } + rc = margo_server_rpc_init(); if (rc != UNIFYFS_SUCCESS) { - LOGERR("%s", unifyfs_rc_enum_description(rc)); + LOGERR("RPC init failed - %s", unifyfs_rc_enum_description(rc)); exit(1); } diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index 4e319f014..06073c3ee 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -504,7 +504,7 @@ int sm_set_fileattr(int gfid, int sm_add_extents(int gfid, size_t num_extents, - struct extent_tree_node* extents) + extent_metadata* extents) { int owner_rank = hash_gfid_to_server(gfid); int is_owner = (owner_rank == glb_pmi_rank); @@ -520,7 +520,7 @@ int sm_add_extents(int gfid, int sm_find_extents(int gfid, size_t num_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* out_num_chunks, chunk_read_req_t** out_chunks, int* full_coverage) @@ -530,14 +530,13 @@ int sm_find_extents(int gfid, int ret = unifyfs_inode_metaget(gfid, &attrs); if (ret == UNIFYFS_SUCCESS) { /* do inode extent lookup */ - unsigned int n_extents = (unsigned int)num_extents; + unsigned int n_extents = (unsigned int) num_extents; ret = unifyfs_inode_resolve_extent_chunks(n_extents, extents, out_num_chunks, out_chunks, full_coverage); if (ret) { - LOGERR("failed to find extents for gfid=%d (rc=%d)", - gfid, ret); + LOGERR("failed to find extents for gfid=%d (rc=%d)", gfid, ret); } else if (*out_num_chunks == 0) { LOGDBG("extent lookup for gfid=%d found no matching chunks", gfid); } @@ -813,7 +812,7 @@ static int process_add_extents_rpc(server_rpc_req_t* req) int sender = (int) in->src_rank; int gfid = (int) in->gfid; size_t num_extents = (size_t) in->num_extents; - struct extent_tree_node* extents = req->bulk_buf; + extent_metadata* extents = req->bulk_buf; /* add extents */ LOGDBG("adding %zu extents to gfid=%d from server[%d]", @@ -848,7 +847,7 @@ static int process_find_extents_rpc(server_rpc_req_t* req) int sender = (int) in->src_rank; int gfid = (int) in->gfid; size_t num_extents = (size_t) in->num_extents; - unifyfs_inode_extent_t* extents = req->bulk_buf; + unifyfs_extent_t* extents = req->bulk_buf; LOGDBG("received %zu extent lookups for gfid=%d from server[%d]", num_extents, gfid, sender); @@ -1105,7 +1104,7 @@ static int process_extents_bcast_rpc(server_rpc_req_t* req) extent_bcast_in_t* in = req->input; int gfid = (int) in->gfid; size_t num_extents = (size_t) in->num_extents; - struct extent_tree_node* extents = req->bulk_buf; + extent_metadata* extents = req->bulk_buf; LOGDBG("gfid=%d num_extents=%zu", gfid, num_extents); @@ -1153,7 +1152,7 @@ static int process_laminate_bcast_rpc(server_rpc_req_t* req) 
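/*
 * Illustrative sketch of the extent record that replaces
 * struct extent_tree_node in the server code above, inferred from the
 * fields this patch assigns (start, end, svr_rank, app_id, cli_id, log_pos)
 * and from the old "1 + (end - start)" length computation.  The _sketch
 * names are placeholders; the exact member types in the real
 * extent_metadata definition may differ.
 */
#include <stddef.h>
#include <sys/types.h>

typedef struct extent_metadata_sketch {
    unsigned long start;    /* first file offset covered by the extent */
    unsigned long end;      /* last file offset covered by the extent */
    int           svr_rank; /* server that holds the logged data */
    int           app_id;   /* owning application id */
    int           cli_id;   /* owning client id within the application */
    off_t         log_pos;  /* offset of the data in the client's log */
} extent_metadata_sketch;

/* extent length in bytes, matching the old (1 + (end - start)) */
static inline size_t sketch_extent_length(const extent_metadata_sketch* e)
{
    return (size_t)(1 + (e->end - e->start));
}

/* offset of the extent within the file */
static inline off_t sketch_extent_offset(const extent_metadata_sketch* e)
{
    return (off_t) e->start;
}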
int gfid = (int) in->gfid; size_t num_extents = (size_t) in->num_extents; unifyfs_file_attr_t* fattr = &(in->attr); - struct extent_tree_node* extents = req->bulk_buf; + extent_metadata* extents = req->bulk_buf; LOGDBG("gfid=%d num_extents=%zu", gfid, num_extents); diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index f15e3ece0..880567048 100644 --- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -80,11 +80,11 @@ int sm_set_fileattr(int gfid, int sm_add_extents(int gfid, size_t num_extents, - struct extent_tree_node* extents); + extent_metadata* extents); int sm_find_extents(int gfid, size_t num_extents, - unifyfs_inode_extent_t* extents, + unifyfs_extent_t* extents, unsigned int* out_num_chunks, chunk_read_req_t** out_chunks, int* full_coverage); diff --git a/server/src/unifyfs_transfer.c b/server/src/unifyfs_transfer.c index c0b70c533..2c117970e 100644 --- a/server/src/unifyfs_transfer.c +++ b/server/src/unifyfs_transfer.c @@ -65,20 +65,20 @@ static int write_transfer_chunk(int fd, return UNIFYFS_SUCCESS; } -static int read_local_extent(struct extent_tree_node* ext, +static int read_local_extent(extent_metadata* ext, transfer_chunk* chk) { int ret = UNIFYFS_SUCCESS; char* buf = chk->chunk_data; - chk->chunk_sz = extent_tree_node_length(ext); - chk->file_offset = extent_tree_node_offset(ext); + chk->chunk_sz = extent_length(ext); + chk->file_offset = extent_offset(ext); /* read data from client log */ app_client* app_clnt = NULL; int app_id = ext->app_id; int cli_id = ext->cli_id; - off_t log_offset = (off_t) ext->pos; + off_t log_offset = (off_t) ext->log_pos; app_clnt = get_app_client(app_id, cli_id); if (NULL != app_clnt) { logio_context* logio_ctx = app_clnt->state.logio_ctx; @@ -114,7 +114,7 @@ int create_local_transfers(int gfid, } size_t n_extents = 0; - struct extent_tree_node* extents = NULL; + extent_metadata* extents = NULL; int rc = unifyfs_inode_get_extents(gfid, &n_extents, &extents); if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to get extents from inode for gfid=%d", gfid); @@ -124,27 +124,27 @@ int create_local_transfers(int gfid, } /* determine local extents */ - struct extent_tree_node* ext; + extent_metadata* ext; size_t n_local_extents = 0; size_t total_local_data_sz = 0; for (size_t i = 0; i < n_extents; i++) { ext = extents + i; if (glb_pmi_rank == ext->svr_rank) { - total_local_data_sz += extent_tree_node_length(ext); + total_local_data_sz += extent_length(ext); n_local_extents++; } } /* make an array of local extents */ - struct extent_tree_node* local_extents = (struct extent_tree_node*) - calloc(n_local_extents, sizeof(struct extent_tree_node)); + extent_metadata* local_extents = (extent_metadata*) + calloc(n_local_extents, sizeof(extent_metadata)); if (NULL == local_extents) { LOGERR("failed to allocate local extents for gfid=%d", gfid); free(extents); return ENOMEM; } - struct extent_tree_node* dst_ext; + extent_metadata* dst_ext; size_t ext_ndx = 0; for (size_t i = 0; i < n_extents; i++) { ext = extents + i; @@ -188,7 +188,7 @@ void* transfer_helper_thread(void* arg) int ret = UNIFYFS_SUCCESS; char* data_copy_buf = NULL; transfer_chunk* chunks = NULL; - struct extent_tree_node* ext; + extent_metadata* ext; transfer_chunk* chk; LOGDBG("I am transfer thread for gfid=%d file=%s", @@ -226,7 +226,7 @@ void* transfer_helper_thread(void* arg) /* make sure longest extent will fit in copy buffer */ for (size_t i = 0; i < n_extents; i++) { ext = tta->local_extents + i; - size_t ext_sz = 
extent_tree_node_length(ext); + size_t ext_sz = extent_length(ext); if (ext_sz > buf_sz) { buf_sz = ext_sz; } @@ -248,7 +248,7 @@ void* transfer_helper_thread(void* arg) size_t copy_sz = 0; for (size_t i = ext_ndx; i < n_extents; i++) { ext = tta->local_extents + i; - size_t ext_sz = extent_tree_node_length(ext); + size_t ext_sz = extent_length(ext); if ((copy_sz + ext_sz) <= buf_sz) { chk = chunks + chk_ndx; chk_ndx++; diff --git a/server/src/unifyfs_transfer.h b/server/src/unifyfs_transfer.h index 6dfca0c80..91cb65390 100644 --- a/server/src/unifyfs_transfer.h +++ b/server/src/unifyfs_transfer.h @@ -35,7 +35,7 @@ typedef struct transfer_thread_args { int transfer_id; /* transfer request id at originating client */ /* local extents to transfer to destination file */ - struct extent_tree_node* local_extents; + extent_metadata* local_extents; size_t n_extents; size_t local_data_sz; /* total size of local data */ From 519e3cf4f3cd992fba5d6b74945aebcc6508d900 Mon Sep 17 00:00:00 2001 From: CamStan Date: Mon, 31 Jan 2022 18:18:29 -0800 Subject: [PATCH 58/81] Set up GitHub Actions Migrate from using Travis CI to GitHub Actions. Will trigger on all branch pushes and pull requests to the dev branch. Runs checkpatch. Tests installing dependencies with Spack, building UnifyFS, and running the unit tests across multiple GCC compiler versions (7, 8, 9, 10, 11). --- .github/workflows/build-and-test.yml | 140 ++++++++++++++++++++++++ .travis.yml | 155 --------------------------- README.md | 3 +- docs/testing.rst | 6 +- t/sharness.d/00-test-env.sh | 4 +- 5 files changed, 147 insertions(+), 161 deletions(-) create mode 100644 .github/workflows/build-and-test.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..e3de511a7 --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,140 @@ +name: UnifyFS Build and Test + +on: + pull_request: + branches: [ dev ] + push: + +jobs: + checkpatch: + runs-on: [ ubuntu-latest ] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 2 + + - name: Checkpatch + run: | + eval $(git log HEAD^..HEAD | sed "s/^ *//g" | grep '^TEST_.*=') + export TEST_CHECKPATCH_SKIP_FILES + (git diff HEAD^..HEAD | ./scripts/checkpatch.sh origin/dev..HEAD) || test "$TEST_CHECKPATCH_ALLOW_FAILURE" = yes + + build-and-test: + strategy: + fail-fast: false + matrix: + os: [ ubuntu-latest ] + compiler: [ gcc ] + gcc: [ 7, 8, 9, 10, 11 ] + + name: ${{ matrix.os }}-${{ matrix.compiler }}-${{ matrix.gcc }} + + runs-on: ${{ matrix.os }} + + env: + CC: gcc-${{ matrix.gcc }} + CXX: g++-${{ matrix.gcc }} + + steps: + - uses: actions/checkout@v2 + + - name: Set up GCC + uses: egor-tensin/setup-gcc@v1 + with: + version: ${{ matrix.gcc }} + + - name: Install additional packages + run: | + sudo apt-get update + sudo apt-get install libtool-bin + sudo apt-get install openmpi-bin + sudo apt-get install libopenmpi-dev + + - name: Install Spack + uses: kzscisoft/install-spack@v1 + + - name: Set up packages.yaml + run: | + test -f $GITHUB_WORKSPACE/.spack/etc/spack/packages.yaml || cat > $GITHUB_WORKSPACE/.spack/etc/spack/packages.yaml << 'EOF' + packages: + all: + target: [x86_64] + providers: + mpi: [openmpi] + autoconf: + buildable: False + externals: + - spec: "autoconf@2.69" + prefix: /usr + automake: + buildable: False + externals: + - spec: "automake@1.16.1" + prefix: /usr + cmake: + buildable: False + externals: + - spec: "cmake@3.22.1" + prefix: /usr + libtool: + buildable: 
False + externals: + - spec: "libtool@2.4.6" + prefix: /usr + m4: + buildable: False + externals: + - spec: "m4@1.4.18" + prefix: /usr + openmpi: + buildable: False + externals: + - spec: "openmpi@4.0.3" + prefix: /usr + pkg-config: + buildable: False + externals: + - spec: "pkg-config@0.29.1" + prefix: /usr + EOF + spack compiler find --scope=user + if [[ $CC == 'gcc-7' ]]; then + spack config add "packages:all:compiler:[gcc@7.5.0]" + elif [[ $CC == 'gcc-8' ]]; then + spack config add "packages:all:compiler:[gcc@8.4.0]" + elif [[ $CC == 'gcc-9' ]]; then + spack config add "packages:all:compiler:[gcc@9.3.0]" + elif [[ $CC == 'gcc-11' ]]; then + spack config add "packages:all:compiler:[gcc@11.1.0]" + else + spack config add "packages:all:compiler:[gcc@10.3.0]" + fi + + - name: Install UnifyFS dependencies + run: | + spack install gotcha@1.0.3 + spack install mochi-margo@0.9.6 ^mercury~boostsys ^libfabric@1.12.1 fabrics=rxm,sockets,tcp + spack install spath~mpi + echo "GOTCHA_INSTALL=$(spack location -i gotcha)" >> $GITHUB_ENV + echo "SPATH_INSTALL=$(spack location -i spath)" >> $GITHUB_ENV + + - name: Configure and Build + run: | + source $GITHUB_WORKSPACE/.spack/share/spack/setup-env.sh + spack load gotcha && spack load argobots && spack load mercury && spack load mochi-margo && spack load spath + ./autogen.sh + ./configure CC=$CC --with-gotcha=$GOTCHA_INSTALL --with-spath=$SPATH_INSTALL --enable-fortran + make V=1 + + - name: Unit Tests + run: | + source $GITHUB_WORKSPACE/.spack/share/spack/setup-env.sh + spack load gotcha && spack load argobots && spack load mercury && spack load mochi-margo && spack load spath + cd t && make check + + - name: After failure + if: ${{ failure() }} + run: | + cat $GITHUB_WORKSPACE/config.log + cat $GITHUB_WORKSPACE/t/test-suite.log diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 7e5b8ac3a..000000000 --- a/.travis.yml +++ /dev/null @@ -1,155 +0,0 @@ -language: c -dist: bionic -os: linux - -addons: - apt: - packages: &native_deps - - autoconf - - autoconf-archive - - automake - - build-essential - - cmake - - diffutils - - gfortran - - libhdf5-openmpi-dev - - libopenmpi-dev - - libtool-bin - - m4 - - openmpi-bin - - pkg-config - -jobs: - include: - - name: "Checkpatch" - before_install: skip - install: skip - before_script: - - eval $(./scripts/git_log_test_env.sh) - - export TEST_CHECKPATCH_SKIP_FILES - script: ./scripts/checkpatch.sh || test "$TEST_CHECKPATCH_ALLOW_FAILURE" = yes - - - name: "GCC 4.9" - dist: xenial - addons: - apt: - update: true - sources: - - ubuntu-toolchain-r-test - packages: - - *native_deps - - g++-4.9 - env: - - MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9" - - - name: "GCC 7.5" - addons: - apt: - update: true - sources: - - ubuntu-toolchain-r-test - packages: - - *native_deps - - g++-7 - env: - - MATRIX_EVAL="CC=gcc-7 && CXX=g++-7" - - - name: "GCC 9.3" - addons: - apt: - update: true - sources: - - ubuntu-toolchain-r-test - packages: - - *native_deps - - g++-9 - env: - - MATRIX_EVAL="CC=gcc-9 && CXX=g++-9" - - - name: "GCC 10.1" - addons: - apt: - update: true - sources: - - ubuntu-toolchain-r-test - packages: - - *native_deps - - g++-10 - env: - - MATRIX_EVAL="CC=gcc-10 && CXX=g++-10" - -before_install: - - eval "${MATRIX_EVAL}" - - echo kernel.yama.ptrace_scope=0 | sudo tee -a /etc/sysctl.conf && sudo sysctl -p - - (cd $HOME/spack; git describe) || git clone https://github.com/spack/spack $HOME/spack - # Create packages.yaml to prevent building dependencies that time out - - | - test -f 
$HOME/spack/etc/spack/packages.yaml || cat > $HOME/spack/etc/spack/packages.yaml << ' EOF' - packages: - all: - target: [x86_64] - providers: - mpi: [openmpi] - autoconf: - buildable: False - externals: - - spec: "autoconf@2.69" - prefix: /usr - automake: - buildable: False - externals: - - spec: "automake@1.15.1" - prefix: /usr - cmake: - buildable: False - externals: - - spec: "cmake@3.12.4" - prefix: /usr/local/cmake-3.12.4 - libtool: - buildable: False - externals: - - spec: "libtool@2.4.6" - prefix: /usr - m4: - buildable: False - externals: - - spec: "m4@1.4.18" - prefix: /usr - openmpi: - buildable: False - externals: - - spec: "openmpi@2.1.1" - prefix: /usr - pkg-config: - buildable: False - externals: - - spec: "pkg-config@0.29.1" - prefix: /usr - EOF - -install: - - . $HOME/spack/share/spack/setup-env.sh - - spack install gotcha@1.0.3 && spack load gotcha@1.0.3 - - spack install mochi-margo@0.9.6 ^mercury~boostsys ^libfabric fabrics=rxm,sockets,tcp && spack load argobots && spack load mercury && spack load mochi-margo - - spack install spath~mpi && spack load spath - # prepare build environment - - GOTCHA_INSTALL=$(spack location -i gotcha) - - SPATH_INSTALL=$(spack location -i spath) - -script: - - export DISTCHECK_CONFIGURE_FLAGS="CC=$CC --with-gotcha=$GOTCHA_INSTALL --with-spath=$SPATH_INSTALL --enable-fortran" - - ./autogen.sh - - ./configure $DISTCHECK_CONFIGURE_FLAGS - - make distcheck V=1 - -cache: - directories: - - $HOME/spack - -# Don't update the cache just because the spack lock file changed -before_cache: - - rm -f $HOME/spack/opt/spack/.spack-db/prefix_lock - -after_failure: - - find . -type f -name "config.log" -execdir cat {} \; - - find . -type f -name "test-suite.log" -execdir cat {} \; diff --git a/README.md b/README.md index 0eb9e41eb..40686898e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,8 @@ see [Build UnifyFS](http://unifyfs.readthedocs.io/en/dev/build.html). ## Build Status Status of UnifyFS development branch (dev): -[![Build Status](https://api.travis-ci.com/LLNL/UnifyFS.png?branch=dev)](https://travis-ci.com/LLNL/UnifyFS) +![Build Status](https://github.com/LLNL/UnifyFS/actions/workflows/build-and-test.yml/badge.svg?branch=dev) + [![Read the Docs](https://readthedocs.org/projects/unifyfs/badge/?version=dev)](https://unifyfs.readthedocs.io) ## Contribute and Develop diff --git a/docs/testing.rst b/docs/testing.rst index bd7b5dae2..362717922 100644 --- a/docs/testing.rst +++ b/docs/testing.rst @@ -335,7 +335,7 @@ normally be run first to start the UnifyFS daemon. E.g., to run just the ``make check`` The tests in https://github.com/LLNL/UnifyFS/tree/dev/t are run automatically -by `Travis CI`_ along with the :ref:`style checks ` when a +using `GitHub Actions`_ along with the :ref:`style checks ` when a pull request is created or updated. All pull requests must pass these tests before they will be accepted. @@ -349,7 +349,7 @@ Interpreting the Results After a test runs, its result is printed out consisting of its status followed by its description and potentially a TODO/SKIP message. Once all the tests -have completed (either from being run manually or by `Travis CI`_), the overall +have completed (either from being run manually or by `GitHub Actions`_), the overall results are printed out, as shown in the image on the right. @@ -1002,6 +1002,7 @@ comments in `t/ci/ci-functions.sh`_. .. explicit external hyperlink targets .. _Bamboo: https://www.atlassian.com/software/bamboo +.. _GitHub Actions: https://docs.github.com/en/actions .. 
_GitLab CI: https://about.gitlab.com .. _examples: https://github.com/LLNL/UnifyFS/tree/dev/examples/src .. _libtap library: https://github.com/zorgnax/libtap @@ -1021,4 +1022,3 @@ comments in `t/ci/ci-functions.sh`_. .. _t/ci/ci-functions.sh: https://github.com/LLNL/UnifyFS/blob/dev/t/ci/ci-functions.sh .. _t/ci/RUN_CI_TESTS.sh: https://github.com/LLNL/UnifyFS/blob/dev/t/ci/RUN_CI_TESTS.sh .. _Test Anything Protocol: https://testanything.org -.. _Travis CI: https://docs.travis-ci.com diff --git a/t/sharness.d/00-test-env.sh b/t/sharness.d/00-test-env.sh index fa91c9a9d..a61583511 100644 --- a/t/sharness.d/00-test-env.sh +++ b/t/sharness.d/00-test-env.sh @@ -56,10 +56,10 @@ if [ -f /proc/sys/kernel/yama/ptrace_scope ]; then scope_val=`cat /proc/sys/kernel/yama/ptrace_scope` if [ $scope_val -ne 0 ]; then if [ $UID -eq 0 ]; then - echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>/dev/null || \ + echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>/dev/null || echo >&2 "Failed to enable cross-memory attach for Mercury shmem" else - sudo echo 0 > /proc/sys/kernel/yama/ptrace_scope 2>/dev/null || \ + echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>/dev/null || echo >&2 "Failed to enable cross-memory attach for Mercury shmem" fi fi From de3178c5b3b56ebc6f4f9bc95df84b53a8f5f904 Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Thu, 3 Feb 2022 17:24:19 -0500 Subject: [PATCH 59/81] fix for serial transfer from non-zero rank client --- client/src/posix_client.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/client/src/posix_client.c b/client/src/posix_client.c index a36d40dca..975f357ca 100644 --- a/client/src/posix_client.c +++ b/client/src/posix_client.c @@ -872,10 +872,8 @@ int unifyfs_transfer_file(const char* src, return -EINVAL; } - /* TODO: Fix parallel transfer logic - * for both serial and parallel transfers, use rank 0 client to - * create the destination file */ - if (0 == client_rank) { + /* create the destination file */ + if ((UNIFYFS_TRANSFER_SERIAL == mode) || (0 == client_rank)) { errno = 0; int create_flags = O_CREAT | O_WRONLY | O_TRUNC; int dst_mode; From 934d4c87aaa4c9693d1f39b755ab877a070113b4 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 24 Mar 2022 14:43:17 -0400 Subject: [PATCH 60/81] Install unifyfs-stage in bin The unifyfs-stage util is expected to be called by users. As such, it should be installed in the bin dir rather than libexec. --- util/unifyfs-stage/src/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/unifyfs-stage/src/Makefile.am b/util/unifyfs-stage/src/Makefile.am index 494909682..b8127ddbe 100644 --- a/util/unifyfs-stage/src/Makefile.am +++ b/util/unifyfs-stage/src/Makefile.am @@ -1,4 +1,4 @@ -libexec_PROGRAMS = unifyfs-stage +bin_PROGRAMS = unifyfs-stage noinst_HEADERS = unifyfs-stage.h From df382cc40f520dce2b482586259afd83f9d41c1e Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 24 Mar 2022 16:18:31 -0400 Subject: [PATCH 61/81] Update docs for unifyfs-stage Update the docs for unifyfs-stage to reflect its new install directory. --- docs/run.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/run.rst b/docs/run.rst index 0afac6973..06ad139bd 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -194,10 +194,9 @@ and/or transferring results out to be verified before the job terminates. 
UnifyFS Stage Executable ^^^^^^^^^^^^^^^^^^^^^^^^ -``$UNIFYFS_INSTALL/libexec/unifyfs-stage`` - The ``start``/``terminate`` transfer API stage functionality can also be used -via the stand-alone application ``unifyfs-stage``. +via the stand-alone application ``unifyfs-stage``. This application is installed +in the same directory as the ``unifyfs`` utility (``$UNIFYFS_INSTALL/bin``). This application can be run at any time within a job to transfer new data into or results out of UnifyFS. @@ -206,7 +205,7 @@ as an argument to use this approach. .. code-block:: Bash - [prompt]$ ./unifyfs-stage --help + [prompt]$ unifyfs-stage --help Usage: unifyfs-stage [OPTION]... @@ -241,12 +240,12 @@ Examples: .. code-block:: Bash :caption: Serial Transfer - $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/unifyfs-stage $MY_MANIFEST_FILE + $ srun -N 1 -n 1 unifyfs-stage $MY_MANIFEST_FILE .. code-block:: Bash :caption: Parallel Transfer - $ srun -N 4 -n 8 $UNIFYFS_INSTALL/libexec/unifyfs-stage --parallel $MY_MANIFEST_FILE + $ srun -N 4 -n 8 unifyfs-stage --parallel $MY_MANIFEST_FILE Transfer Executable ^^^^^^^^^^^^^^^^^^^ From 99a3cfe72993a42d3413b06aae119cef66f330af Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 31 Mar 2022 14:03:41 -0400 Subject: [PATCH 62/81] Change location of unifyfs-stage in test script Point the test script at the new location (ie: the bin dir) for the unifyfs-stage binary. --- t/ci/800-stage-tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/ci/800-stage-tests.sh b/t/ci/800-stage-tests.sh index c2308889a..9e6b72c7a 100755 --- a/t/ci/800-stage-tests.sh +++ b/t/ci/800-stage-tests.sh @@ -28,7 +28,7 @@ test_description="UnifyFS Stage-in+Stage-out tests" # utility checking to make sure everything's in place. -STAGE_EXE=${UNIFYFS_EXAMPLES}/unifyfs-stage +STAGE_EXE=${UNIFYFS_BIN}/unifyfs-stage test_expect_success "unify-stage executable exists" ' test_path_is_file $STAGE_EXE ' From 2334e8d3e195e3d0a1812d9d29fef4f03e332f3a Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Fri, 13 May 2022 14:37:25 -0400 Subject: [PATCH 63/81] Use PMIX_JOB_SIZE instead of PMIX_UNIV_SIZE The PMIX_UNIV_SIZE value is the number of allocated slots in the job whereas the PMIX_JOB_SIZE is the number of actual processes that have been started. When trying to calculate how many instances of unifyfsd to expect, PMIX_JOB_SIZE is the correct value. 
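To make the distinction concrete, the following stand-alone sketch queries both attributes for the job's namespace and prints them side by side. It is illustrative only and not taken from the UnifyFS sources or this patch; it assumes a PMIx-enabled launcher and the standard PMIx client API (PMIx_Init, PMIx_Get, PMIx_Finalize).

    #include <stdio.h>
    #include <string.h>
    #include <pmix.h>

    /* Query a job-level attribute (rank = wildcard) and return it as uint32 */
    static uint32_t get_job_attr(const pmix_proc_t* me, const char* key)
    {
        pmix_proc_t wild;
        pmix_value_t* val = NULL;
        uint32_t n = 0;

        PMIX_PROC_CONSTRUCT(&wild);
        strncpy(wild.nspace, me->nspace, PMIX_MAX_NSLEN);
        wild.rank = PMIX_RANK_WILDCARD;

        if (PMIx_Get(&wild, key, NULL, 0, &val) == PMIX_SUCCESS) {
            n = val->data.uint32;
            PMIX_VALUE_RELEASE(val);
        }
        return n;
    }

    int main(void)
    {
        pmix_proc_t me;
        if (PMIx_Init(&me, NULL, 0) != PMIX_SUCCESS) {
            return 1;
        }
        /* UNIV_SIZE = allocated slots; JOB_SIZE = processes actually started.
         * They differ whenever fewer processes are launched than slots exist,
         * which is why unifyfsd should size itself from PMIX_JOB_SIZE. */
        printf("PMIX_UNIV_SIZE=%u PMIX_JOB_SIZE=%u\n",
               get_job_attr(&me, PMIX_UNIV_SIZE),
               get_job_attr(&me, PMIX_JOB_SIZE));
        PMIx_Finalize(NULL, 0);
        return 0;
    }

On an allocation with idle slots (for example, launching one unifyfsd per node in a multi-slot-per-node allocation), PMIX_UNIV_SIZE reports the slot count while PMIX_JOB_SIZE reports the number of processes that were actually started.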
See issue LLNL/Unifyfs#707 --- common/src/unifyfs_keyval.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 151661d72..9187215fb 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -368,17 +368,18 @@ int unifyfs_pmix_init(void) kv_max_keylen = PMIX_MAX_KEYLEN; kv_max_vallen = PMIX_MAX_VALLEN; - /* get PMIx universe size */ + /* get PMIx job size */ PMIX_PROC_CONSTRUCT(&proc); - (void)strncpy(proc.nspace, pmix_myproc.nspace, PMIX_MAX_NSLEN); + strlcpy(proc.nspace, pmix_myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; - rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &valp); + rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); if (rc != PMIX_SUCCESS) { - LOGERR("PMIx rank %d: PMIx_Get(UNIV_SIZE) failed: %s", + LOGERR("PMIx rank %d: PMIx_Get(JOB_SIZE) failed: %s", pmix_myproc.rank, PMIx_Error_string(rc)); return (int)UNIFYFS_ERROR_PMI; } pmix_univ_nprocs = (size_t) valp->data.uint32; + LOGDBG("PMIX_JOB_SIZE: %d", pmix_univ_nprocs); kv_myrank = pmix_myproc.rank; kv_nranks = (int)pmix_univ_nprocs; @@ -447,7 +448,7 @@ static int unifyfs_pmix_lookup(const char* key, } /* set key to lookup */ - strncpy(pmix_key, key, sizeof(pmix_key)); + strlcpy(pmix_key, key, sizeof(pmix_key)); PMIX_PDATA_CREATE(pdata, 1); PMIX_PDATA_LOAD(&pdata[0], &pmix_myproc, pmix_key, NULL, PMIX_STRING); @@ -498,7 +499,7 @@ static int unifyfs_pmix_publish(const char* key, } /* set key-val and modify publish behavior */ - strncpy(pmix_key, key, sizeof(pmix_key)); + strlcpy(pmix_key, key, sizeof(pmix_key)); range = PMIX_RANGE_GLOBAL; ninfo = 2; PMIX_INFO_CREATE(info, ninfo); From 14955142bc814a884e6366c2e71dfe98be8025c7 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Tue, 24 May 2022 09:52:06 -0400 Subject: [PATCH 64/81] Trap errors in bootstrap.sh Use the bash 'trap' command to catch errors in any of the commands in the script. If an error is caught, print a message with the line number and abort the script. --- bootstrap.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bootstrap.sh b/bootstrap.sh index df1237ead..d354ba476 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -3,6 +3,13 @@ # This is an easy-bake script to download and build all UnifyFS's dependencies. # +# Abort the script if we have any errors +err_report() { + echo "!!!! Previous command failed! (Line number $1) Aborting. !!!!" + exit 1 +} +trap 'err_report $LINENO' ERR + make_nproc=4 ROOT="$(pwd)" From 968c2b1329f90ad46ff7e2a94acbc7bb0ad0e501 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Tue, 24 May 2022 10:04:22 -0400 Subject: [PATCH 65/81] Use spath:main branch in bootstrap.sh Change bootstrap.sh to checkout the "main" branch of the spath repo instead of "master" because "master" doesn't exist anymore. --- bootstrap.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bootstrap.sh b/bootstrap.sh index d354ba476..15d33bf17 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -252,7 +252,7 @@ cd .. echo "### building spath ###" cd spath -git checkout master +git checkout main mkdir -p build && cd build cmake -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DMPI=OFF .. make -j $make_nproc && make install From b391dfcc116ece0818aa93f2135f33f9197f8466 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Mon, 23 May 2022 14:22:24 -0400 Subject: [PATCH 66/81] improve support for file staging Client: * POSIX wrappers should set errno=0 on success Library API: * rename unifyfs_status to unifyfs_file_status * rename unifyfs_ioreq_state to unifyfs_req_state * add transfer-specific result structure unifyfs_transfer_result that includes size of transferred file and elapsed transfer time * expose is_unifyfs_path() utility function Examples & Tests: * updates for API changes Server: * fix bug when API transfer initiated from non-owner * ENOENT is ok for non-owner sm_transfer/truncate() * support new transfer API result fields (size, time) * enable real ULT concurrency for broadcast progress * fix case where transfer broadcast collective progressed redundantly unifyfs-stage: * fix path to executable in unifyfs utility, and use separate status files for stage-in/out * use -S for status file path rather than passing share-dir * new implementation that uses API instead of I/O interposition * improve verbose messages TEST_CHECKPATCH_SKIP_FILES="t/ci/800-stage-tests.sh" --- client/src/client_transfer.c | 37 +- client/src/client_transfer.h | 7 +- client/src/margo_client.c | 16 +- client/src/posix_client.c | 2 +- client/src/unifyfs-stdio.c | 59 +- client/src/unifyfs-sysio.c | 81 ++- client/src/unifyfs_api.h | 41 +- client/src/unifyfs_api_file.c | 9 +- client/src/unifyfs_api_internal.h | 4 - client/src/unifyfs_api_io.c | 24 +- client/src/unifyfs_api_transfer.c | 14 +- client/src/unifyfs_fid.c | 2 +- common/src/unifyfs_client_rpcs.h | 3 + common/src/unifyfs_misc.c | 43 +- common/src/unifyfs_misc.h | 17 +- docs/library_api.rst | 38 +- docs/run.rst | 115 +-- examples/src/testutil_rdwr.h | 2 +- server/src/margo_server.c | 14 +- server/src/margo_server.h | 3 + server/src/unifyfs_fops_rpc.c | 4 +- server/src/unifyfs_global.h | 6 + server/src/unifyfs_group_rpc.c | 124 ++-- server/src/unifyfs_group_rpc.h | 15 +- server/src/unifyfs_request_manager.c | 5 +- server/src/unifyfs_service_manager.c | 116 ++- server/src/unifyfs_service_manager.h | 2 +- server/src/unifyfs_transfer.c | 279 ++++---- server/src/unifyfs_transfer.h | 14 +- server/src/unifyfs_tree.c | 7 +- t/0700-unifyfs-stage-full.t | 4 +- t/9300-unifyfs-stage-isolated.t | 70 -- t/Makefile.am | 1 - t/api/laminate.c | 2 +- t/api/storage-reuse.c | 4 +- t/api/transfer.c | 2 +- t/api/write-read-sync-stat.c | 2 +- t/ci/800-stage-tests.sh | 163 ++--- util/unifyfs-stage/src/Makefile.am | 8 +- .../src/unifyfs-stage-transfer.c | 670 ++++++++++++------ util/unifyfs-stage/src/unifyfs-stage.c | 465 ++++++++---- util/unifyfs-stage/src/unifyfs-stage.h | 94 +-- util/unifyfs/src/unifyfs-rm.c | 203 +++--- util/unifyfs/src/unifyfs.c | 15 +- util/unifyfs/src/unifyfs.h | 11 +- 45 files changed, 1692 insertions(+), 1125 deletions(-) delete mode 100755 t/9300-unifyfs-stage-isolated.t diff --git a/client/src/client_transfer.c b/client/src/client_transfer.c index 99c530f87..127c18cd2 100644 --- a/client/src/client_transfer.c +++ b/client/src/client_transfer.c @@ -35,14 +35,14 @@ static const char* transfer_mode_str(unifyfs_transfer_mode mode) static const char* state_canceled_str = "CANCELED"; static const char* state_completed_str = "COMPLETED"; static const char* state_inprogress_str = "IN-PROGRESS"; -static const char* transfer_state_str(unifyfs_ioreq_state state) +static const char* transfer_state_str(unifyfs_req_state state) { switch (state) { - case UNIFYFS_IOREQ_STATE_IN_PROGRESS: + case UNIFYFS_REQ_STATE_IN_PROGRESS: return state_inprogress_str; - case 
UNIFYFS_IOREQ_STATE_CANCELED: + case UNIFYFS_REQ_STATE_CANCELED: return state_canceled_str; - case UNIFYFS_IOREQ_STATE_COMPLETED: + case UNIFYFS_REQ_STATE_COMPLETED: return state_completed_str; default: return invalid_str; @@ -174,13 +174,13 @@ bool client_check_transfer_complete(client_transfer_status* transfer) bool is_complete = false; switch (req->state) { - case UNIFYFS_IOREQ_STATE_IN_PROGRESS: + case UNIFYFS_REQ_STATE_IN_PROGRESS: ABT_mutex_lock(transfer->sync); is_complete = (transfer->complete == 1); ABT_mutex_unlock(transfer->sync); break; - case UNIFYFS_IOREQ_STATE_CANCELED: - case UNIFYFS_IOREQ_STATE_COMPLETED: + case UNIFYFS_REQ_STATE_CANCELED: + case UNIFYFS_REQ_STATE_COMPLETED: is_complete = true; break; default: @@ -207,7 +207,7 @@ int client_cleanup_transfer(unifyfs_client* client, unifyfs_transfer_request* req = transfer->req; debug_print_transfer_req(req); - if ((req->state == UNIFYFS_IOREQ_STATE_COMPLETED) && + if ((req->state == UNIFYFS_REQ_STATE_COMPLETED) && (req->mode == UNIFYFS_TRANSFER_MODE_MOVE)) { /* successful copy, now remove source */ if (transfer->src_in_unify) { @@ -238,10 +238,13 @@ int client_cleanup_transfer(unifyfs_client* client, } /* Update the transfer status for the client (app_id + client_id) - * transfer request (transfer_id) using the given error_code */ + * transfer request (transfer_id) using the given error_code, transfer + * size, and transfer time */ int client_complete_transfer(unifyfs_client* client, int transfer_id, - int error_code) + int error_code, + size_t transfer_size_bytes, + double transfer_time_seconds) { if (NULL == client) { LOGERR("NULL client"); @@ -264,7 +267,9 @@ int client_complete_transfer(unifyfs_client* client, /* update the request status */ ABT_mutex_lock(transfer->sync); req->result.error = error_code; - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->result.file_size_bytes = transfer_size_bytes; + req->result.transfer_time_seconds = transfer_time_seconds; + req->state = UNIFYFS_REQ_STATE_COMPLETED; transfer->complete = 1; ABT_mutex_unlock(transfer->sync); @@ -289,7 +294,9 @@ int client_submit_transfers(unifyfs_client* client, default: req->result.error = EINVAL; req->result.rc = UNIFYFS_FAILURE; - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->result.file_size_bytes = 0; + req->result.transfer_time_seconds = 0.0; + req->state = UNIFYFS_REQ_STATE_COMPLETED; continue; } @@ -305,7 +312,7 @@ int client_submit_transfers(unifyfs_client* client, (src_in_unify && dst_in_unify)) { rc = EINVAL; } else { - req->state = UNIFYFS_IOREQ_STATE_IN_PROGRESS; + req->state = UNIFYFS_REQ_STATE_IN_PROGRESS; rc = client_create_transfer(client, req, src_in_unify); if (UNIFYFS_SUCCESS == rc) { if (src_in_unify) { @@ -322,8 +329,10 @@ int client_submit_transfers(unifyfs_client* client, if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; req->result.rc = UNIFYFS_FAILURE; + req->result.file_size_bytes = 0; + req->result.transfer_time_seconds = 0.0; + req->state = UNIFYFS_REQ_STATE_COMPLETED; ret = UNIFYFS_FAILURE; - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; } } diff --git a/client/src/client_transfer.h b/client/src/client_transfer.h index b3a76131d..7cc1f10b0 100644 --- a/client/src/client_transfer.h +++ b/client/src/client_transfer.h @@ -48,10 +48,13 @@ int client_cleanup_transfer(unifyfs_client* client, client_transfer_status* transfer); /* Update the transfer status for the client (app_id + client_id) - * transfer request (transfer_id) using the given error_code */ + * transfer request (transfer_id) using the given error_code, 
transfer + * size, and transfer time */ int client_complete_transfer(unifyfs_client* client, int transfer_id, - int error_code); + int error_code, + size_t transfer_size_bytes, + double transfer_time_seconds); /* Given an array of transfer requests, submit each request after * creating a transfer status structure for the request and storing it diff --git a/client/src/margo_client.c b/client/src/margo_client.c index b3cc7f2b5..0151fa646 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -1106,17 +1106,23 @@ static void unifyfs_transfer_complete_rpc(hg_handle_t handle) } else { /* lookup client transfer request */ unifyfs_client* client; - int client_app = (int) in.app_id; - int client_id = (int) in.client_id; - int transfer_id = (int) in.transfer_id; - int error_code = (int) in.error_code; + int client_app = (int) in.app_id; + int client_id = (int) in.client_id; + int transfer_id = (int) in.transfer_id; + size_t transfer_sz = (size_t) in.transfer_size_bytes; + uint32_t xfer_sec = (uint32_t) in.transfer_time_sec; + uint32_t xfer_usec = (uint32_t) in.transfer_time_usec; + int error_code = (int) in.error_code; client = unifyfs_find_client(client_app, client_id, NULL); if (NULL == client) { /* unknown client */ ret = EINVAL; } else { /* Update the transfer state */ - ret = client_complete_transfer(client, transfer_id, error_code); + double transfer_time = (double) xfer_sec; + transfer_time += (double) xfer_usec / 1000000.0; + ret = client_complete_transfer(client, transfer_id, error_code, + transfer_sz, transfer_time); } margo_free_input(handle, &in); } diff --git a/client/src/posix_client.c b/client/src/posix_client.c index 975f357ca..d17840cac 100644 --- a/client/src/posix_client.c +++ b/client/src/posix_client.c @@ -895,7 +895,7 @@ int unifyfs_transfer_file(const char* src, free(dst_path); return -err; } - close(fd); + UNIFYFS_WRAP(close)(fd); } if (mode == UNIFYFS_TRANSFER_PARALLEL) { diff --git a/client/src/unifyfs-stdio.c b/client/src/unifyfs-stdio.c index b7206101e..e401c6076 100644 --- a/client/src/unifyfs-stdio.c +++ b/client/src/unifyfs-stdio.c @@ -261,7 +261,7 @@ static int unifyfs_fopen_parse_mode( } } - return UNIFYFS_SUCCESS; + return 0; } /* @@ -269,10 +269,9 @@ static int unifyfs_fopen_parse_mode( * fopen mode semantics, initializes outstream and returns UNIFYFS_SUCCESS if * successful. Returns some other UNIFYFS/errno error otherwise. 
*/ -static int unifyfs_fopen( - const char* path, - const char* mode, - FILE** outstream) +static int unifyfs_fopen(const char* path, + const char* mode, + FILE** outstream) { /* assume that we'll fail */ *outstream = NULL; @@ -280,7 +279,7 @@ static int unifyfs_fopen( /* parse the fopen mode string */ int read, write, append, plus; int parse_rc = unifyfs_fopen_parse_mode(mode, &read, &write, &append, &plus); - if (parse_rc != UNIFYFS_SUCCESS) { + if (parse_rc) { return parse_rc; } @@ -541,17 +540,16 @@ static int unifyfs_stream_read( if (filedesc == NULL) { /* ERROR: invalid file descriptor */ s->err = 1; - errno = EBADF; LOGDBG("Invalid file descriptor"); + errno = EBADF; return EBADF; } /* bail with error if stream not open for reading */ if (!filedesc->read) { s->err = 1; - errno = EBADF; LOGDBG("Stream not open for reading"); - + errno = EBADF; return EBADF; } @@ -562,8 +560,8 @@ static int unifyfs_stream_read( if (setvbuf_rc != UNIFYFS_SUCCESS) { /* ERROR: failed to associate buffer */ s->err = 1; - errno = unifyfs_rc_errno(setvbuf_rc); LOGDBG("Couldn't setvbuf"); + errno = unifyfs_rc_errno(setvbuf_rc); return setvbuf_rc; } } @@ -976,7 +974,7 @@ static int unifyfs_fseek(FILE* stream, off_t offset, int whence) /* clear end-of-file indicator */ s->eof = 0; - + errno = 0; return 0; } @@ -991,6 +989,7 @@ FILE* UNIFYFS_WRAP(fopen)(const char* path, const char* mode) errno = unifyfs_rc_errno(rc); return NULL; } + errno = 0; return stream; } else { MAP_OR_FAIL(fopen); @@ -1004,8 +1003,8 @@ FILE* UNIFYFS_WRAP(freopen)(const char* path, const char* mode, FILE* stream) /* check whether we should intercept this path */ if (unifyfs_intercept_stream(stream)) { /* return file descriptor associated with stream */ - unifyfs_unsupported_stream(stream, __func__, __FILE__, __LINE__, "new file %s", - path); + unifyfs_unsupported_stream(stream, __func__, __FILE__, __LINE__, + "new file %s", path); /* lookup stream */ unifyfs_stream_t* s = (unifyfs_stream_t*) stream; @@ -1028,6 +1027,7 @@ int UNIFYFS_WRAP(setvbuf)(FILE* stream, char* buf, int type, size_t size) errno = unifyfs_rc_errno(rc); return 1; } + errno = 0; return 0; } else { MAP_OR_FAIL(setvbuf); @@ -1179,6 +1179,7 @@ int UNIFYFS_WRAP(fputc)(int c, FILE* stream) } /* return value written */ + errno = 0; return (int) charbuf; } else { MAP_OR_FAIL(fputc); @@ -1203,6 +1204,7 @@ int UNIFYFS_WRAP(getc)(FILE* stream) } /* return byte read cast as an int */ + errno = 0; return (int) charbuf; } else { MAP_OR_FAIL(getc); @@ -1226,6 +1228,7 @@ int UNIFYFS_WRAP(putc)(int c, FILE* stream) } /* return value written */ + errno = 0; return (int) charbuf; } else { MAP_OR_FAIL(putc); @@ -1287,7 +1290,7 @@ char* UNIFYFS_WRAP(fgets)(char* s, int n, FILE* stream) /* terminate string with a NUL */ s[limit] = '\0'; - + errno = 0; return s; } else { MAP_OR_FAIL(fgets); @@ -1314,6 +1317,7 @@ int UNIFYFS_WRAP(fputs)(const char* s, FILE* stream) } /* return success */ + errno = 0; return 0; } else { MAP_OR_FAIL(fputs); @@ -1345,6 +1349,7 @@ size_t UNIFYFS_WRAP(fread)(void* ptr, size_t size, size_t nitems, FILE* stream) } /* return number of items read */ + errno = 0; if (retcount < count) { /* adjust return value if read less data than requested */ size_t nitems_read = retcount / size; @@ -1382,6 +1387,7 @@ size_t UNIFYFS_WRAP(fwrite)(const void* ptr, size_t size, size_t nitems, } /* return number of items written */ + errno = 0; return nitems; } else { MAP_OR_FAIL(fwrite); @@ -1438,6 +1444,7 @@ int UNIFYFS_WRAP(vfprintf)(FILE* stream, const char* format, va_list ap) 
free(str); /* return number of bytes written */ + errno = 0; return chars; } else { va_list ap2; @@ -1520,8 +1527,7 @@ int UNIFYFS_WRAP(fseek)(FILE* stream, long offset, int whence) /* check whether we should intercept this stream */ if (unifyfs_intercept_stream(stream)) { off_t offset_offt = (off_t) offset; - int rc = unifyfs_fseek(stream, offset_offt, whence); - return rc; + return unifyfs_fseek(stream, offset_offt, whence); } else { MAP_OR_FAIL(fseek); int ret = UNIFYFS_REAL(fseek)(stream, offset, whence); @@ -1533,8 +1539,7 @@ int UNIFYFS_WRAP(fseeko)(FILE* stream, off_t offset, int whence) { /* check whether we should intercept this stream */ if (unifyfs_intercept_stream(stream)) { - int rc = unifyfs_fseek(stream, offset, whence); - return rc; + return unifyfs_fseek(stream, offset, whence); } else { MAP_OR_FAIL(fseeko); int ret = UNIFYFS_REAL(fseeko)(stream, offset, whence); @@ -1561,6 +1566,7 @@ long UNIFYFS_WRAP(ftell)(FILE* stream) /* get current position */ off_t current_pos = filedesc->pos; + errno = 0; return (long)current_pos; } else { MAP_OR_FAIL(ftell); @@ -1587,6 +1593,7 @@ off_t UNIFYFS_WRAP(ftello)(FILE* stream) /* get current position */ off_t current_pos = filedesc->pos; + errno = 0; return current_pos; } else { MAP_OR_FAIL(ftello); @@ -1613,14 +1620,13 @@ void UNIFYFS_WRAP(rewind)(FILE* stream) /* seek to front of file */ int rc = unifyfs_fseek(stream, (off_t) 0L, SEEK_SET); - /* set errno */ - errno = unifyfs_rc_errno(rc); - /* clear error indicator if seek successful */ if (rc == 0) { s->err = 0; } + /* set errno and return */ + errno = unifyfs_rc_errno(rc); return; } else { MAP_OR_FAIL(rewind); @@ -1669,7 +1675,7 @@ int UNIFYFS_WRAP(fgetpos)(FILE* stream, fpos_t* pos) /* save pointer to state in output parameter */ void** ptr = (void**) pos; *ptr = (void*) state; - + errno = 0; return 0; } else { MAP_OR_FAIL(fgetpos); @@ -1741,6 +1747,9 @@ int UNIFYFS_WRAP(fflush)(FILE* stream) } } + if (!ret) { + errno = 0; + } return ret; } @@ -1753,7 +1762,7 @@ int UNIFYFS_WRAP(fflush)(FILE* stream) /* ERROR: flush sets error indicator and errno */ return EOF; } - + errno = 0; return 0; } else { MAP_OR_FAIL(fflush); @@ -1763,7 +1772,7 @@ int UNIFYFS_WRAP(fflush)(FILE* stream) } /* return non-zero if and only if end-of-file indicator is set - * for stream */ + * for stream (does not set errno) */ int UNIFYFS_WRAP(feof)(FILE* stream) { /* check whether we should intercept this stream */ @@ -1839,6 +1848,7 @@ int UNIFYFS_WRAP(fileno)(FILE* stream) /* return file descriptor associated with stream but don't conflict * with active system fds that range from 0 - (fd_limit) */ ret = fd + unifyfs_fd_limit; + errno = 0; return ret; } else { MAP_OR_FAIL(fileno); @@ -1903,6 +1913,7 @@ int UNIFYFS_WRAP(fclose)(FILE* stream) unifyfs_stack_push(posix_stream_stack, s->sid); /* currently a no-op */ + errno = 0; return 0; } else { MAP_OR_FAIL(fclose); diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index dd0826909..96d8c54a0 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -66,6 +66,7 @@ int UNIFYFS_WRAP(access)(const char* path, int mode) /* currently a no-op */ LOGDBG("access: path intercepted, returning 0, %s", upath); + errno = 0; return 0; } else { LOGDBG("access: calling MAP_OR_FAIL, %s", path); @@ -102,6 +103,7 @@ int UNIFYFS_WRAP(mkdir)(const char* path, mode_t mode) } /* success */ + errno = 0; return 0; } else { MAP_OR_FAIL(mkdir); @@ -120,8 +122,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) { /* check if the mount point 
itself is being deleted */ if (!strcmp(upath, unifyfs_mount_prefix)) { - errno = EBUSY; - return -1; + return EBUSY; } /* check if path exists locally */ @@ -136,8 +137,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) if (mask & S_IFREG) { /* ERROR: unlink likely made this request but path is a dir */ LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); - errno = EISDIR; - return -1; + return EISDIR; } /* remove/rmdir likely made this request (mask & (0 | S_IFDIR)) */ @@ -146,16 +146,14 @@ static int _unifyfs_remove(const char* upath, mode_t mask) /* ERROR: is a directory, but isn't empty */ LOGDBG("Attempting to remove non-empty dir %s in UNIFYFS", upath); - errno = ENOTEMPTY; - return -1; + return ENOTEMPTY; } } else { /* not a directory */ /* check if remove request was for a directory */ if (mask & S_IFDIR) { /* ERROR: rmdir likely made this request but path not a dir */ LOGDBG("Attempting to rmdir a non-dir %s in UNIFYFS", upath); - errno = ENOTDIR; - return -1; + return ENOTDIR; } } @@ -165,8 +163,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) /* failed to remove the target, * set errno and return */ LOGDBG("remove failed on %s in UNIFYFS", upath); - errno = unifyfs_rc_errno(ret); - return -1; + return unifyfs_rc_errno(ret); } } else { /* path doesn't exist locally, but may exist globally */ @@ -177,8 +174,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) if (ret != UNIFYFS_SUCCESS) { /* ERROR: path doesn't exist locally or globally */ LOGDBG("Couldn't find entry for %s in UNIFYFS", upath); - errno = ENOENT; - return -1; + return ENOENT; } /* is it a directory? */ @@ -187,8 +183,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) if (mask & S_IFREG) { /* ERROR: unlink likely made this request but path is a dir */ LOGDBG("Attempting to unlink a directory %s in UNIFYFS", upath); - errno = EISDIR; - return -1; + return EISDIR; } /* Current directory structure assumes all directories are empty. @@ -201,8 +196,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) /* check if remove request was for a directory */ if ((mask & S_IFDIR)) { LOGDBG("Attempting to rmdir a non-dir %s in UNIFYFS", upath); - errno = ENOTDIR; - return -1; + return ENOTDIR; } } @@ -210,8 +204,7 @@ static int _unifyfs_remove(const char* upath, mode_t mask) ret = invoke_client_unlink_rpc(posix_client, gfid); if (ret != UNIFYFS_SUCCESS) { LOGDBG("unlink rpc failed on %s in UNIFYFS", upath); - errno = unifyfs_rc_errno(ret); - return -1; + return unifyfs_rc_errno(ret); } } @@ -226,12 +219,14 @@ int UNIFYFS_WRAP(rmdir)(const char* path) if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with S_IFDIR mask */ int ret = _unifyfs_remove(upath, S_IFDIR); - if (ret != UNIFYFS_SUCCESS) { + if (ret) { LOGDBG("rmdir() failed on %s in UNIFYFS", upath); + errno = ret; return -1; } /* success */ + errno = 0; return 0; } else { MAP_OR_FAIL(rmdir); @@ -247,10 +242,16 @@ int UNIFYFS_WRAP(chdir)(const char* path) if (unifyfs_intercept_path(path, upath)) { /* TODO: check that path is not a file? 
*/ /* we're happy to change into any directory in unifyfs */ + char* upath_copy = strdup(upath); + if (upath_copy == NULL) { + errno = ENOMEM; + return -1; + } if (posix_client->cwd != NULL) { free(posix_client->cwd); } - posix_client->cwd = strdup(upath); + posix_client->cwd = upath_copy; + errno = 0; return 0; } else { MAP_OR_FAIL(chdir); @@ -315,6 +316,7 @@ static char* _getcwd_impl(char* path, size_t size) buf = (char*) malloc(size); if (buf != NULL) { strncpy(buf, posix_client->cwd, size); + errno = 0; } else { errno = ENOMEM; } @@ -331,6 +333,7 @@ static char* _getcwd_impl(char* path, size_t size) buf = (char*) malloc(len); if (buf != NULL) { strlcpy(buf, posix_client->cwd, len); + errno = 0; } else { errno = ENOMEM; } @@ -342,6 +345,7 @@ static char* _getcwd_impl(char* path, size_t size) if (len <= size) { /* current working dir fits, copy and return */ strncpy(path, posix_client->cwd, size); + errno = 0; return path; } else { /* user's buffer is too small */ @@ -470,6 +474,7 @@ char* UNIFYFS_WRAP(getwd)(char* path) size_t len = strlen(posix_client->cwd) + 1; if (len <= PATH_MAX) { strncpy(path, posix_client->cwd, PATH_MAX); + errno = 0; return path; } else { /* user's buffer is too small */ @@ -521,6 +526,8 @@ char* UNIFYFS_WRAP(get_current_dir_name)(void) char* ret = strdup(posix_client->cwd); if (ret == NULL) { errno = ENOMEM; + } else { + errno = 0; } return ret; } else { @@ -601,6 +608,7 @@ int UNIFYFS_WRAP(rename)(const char* oldpath, const char* newpath) pthread_mutex_unlock(&(posix_client->sync)); /* success */ + errno = 0; return 0; } else { /* for now, we can only rename within our file system */ @@ -647,6 +655,7 @@ int UNIFYFS_WRAP(truncate)(const char* path, off_t length) } /* success */ + errno = 0; return 0; } else { MAP_OR_FAIL(truncate); @@ -662,12 +671,14 @@ int UNIFYFS_WRAP(unlink)(const char* path) if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with S_IFREG mask */ int ret = _unifyfs_remove(upath, S_IFREG); - if (ret != UNIFYFS_SUCCESS) { + if (ret) { LOGDBG("unlink() failed on %s in UNIFYFS", upath); + errno = ret; return -1; } /* success */ + errno = 0; return 0; } else { MAP_OR_FAIL(unlink); @@ -683,12 +694,14 @@ int UNIFYFS_WRAP(remove)(const char* path) if (unifyfs_intercept_path(path, upath)) { /* call shared logic function with 0 mask */ int ret = _unifyfs_remove(upath, 0); - if (ret != UNIFYFS_SUCCESS) { + if (ret) { LOGDBG("remove() failed on %s in UNIFYFS", upath); + errno = ret; return -1; } /* success */ + errno = 0; return 0; } else { MAP_OR_FAIL(remove); @@ -770,7 +783,7 @@ static int __stat(const char* path, struct stat* buf) /* copy attributes to stat struct */ unifyfs_file_attr_to_stat(&fattr, buf); - + errno = 0; return 0; } @@ -938,6 +951,8 @@ int UNIFYFS_WRAP(statfs)(const char* path, struct statfs* fsbuf) if (ret) { errno = ret; ret = -1; + } else { + errno = 0; } } else { MAP_OR_FAIL(statfs); @@ -959,6 +974,8 @@ int UNIFYFS_WRAP(fstatfs)(int fd, struct statfs* fsbuf) if (ret) { errno = ret; ret = -1; + } else { + errno = 0; } } else { MAP_OR_FAIL(fstatfs); @@ -1123,6 +1140,7 @@ static int posix_create(char* upath, mode_t mode) int ret = fd + unifyfs_fd_limit; LOGDBG("using fds (internal=%d, external=%d) for fid %d file %s", fd, ret, fid, upath); + errno = 0; return ret; } @@ -1203,6 +1221,7 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) 
ret = fd + unifyfs_fd_limit; LOGDBG("using fds (internal=%d, external=%d) for fid %d file %s", fd, ret, fid, upath); + errno = 0; return ret; } else { MAP_OR_FAIL(open); @@ -1375,6 +1394,7 @@ off_t UNIFYFS_WRAP(lseek)(int fd, off_t offset, int whence) /* set and return final file position */ filedesc->pos = current_pos; + errno = 0; return current_pos; } else { MAP_OR_FAIL(lseek); @@ -1492,6 +1512,7 @@ ssize_t UNIFYFS_WRAP(read)(int fd, void* buf, size_t count) filedesc->pos += (off_t)bytes; /* return number of bytes read */ + errno = 0; return (ssize_t)bytes; } else { MAP_OR_FAIL(read); @@ -1540,6 +1561,7 @@ ssize_t UNIFYFS_WRAP(write)(int fd, const void* buf, size_t count) filedesc->pos = pos + bytes; /* return number of bytes written */ + errno = 0; return (ssize_t)bytes; } else { MAP_OR_FAIL(write); @@ -1569,6 +1591,7 @@ ssize_t UNIFYFS_WRAP(readv)(int fd, const struct iovec* iov, int iovcnt) ret += rret; } } + errno = 0; return ret; } else { MAP_OR_FAIL(readv); @@ -1599,6 +1622,7 @@ ssize_t UNIFYFS_WRAP(writev)(int fd, const struct iovec* iov, int iovcnt) } } } + errno = 0; return ret; } else { MAP_OR_FAIL(writev); @@ -1717,6 +1741,8 @@ int UNIFYFS_WRAP(lio_listio)(int mode, struct aiocb* const aiocb_list[], if (ret) { errno = unifyfs_rc_errno(ret); ret = -1; + } else { + errno = 0; } return ret; } @@ -1770,6 +1796,7 @@ ssize_t UNIFYFS_WRAP(pread)(int fd, void* buf, size_t count, off_t offset) } /* return number of bytes read */ + errno = 0; return retcount; } else { MAP_OR_FAIL(pread); @@ -1817,6 +1844,7 @@ ssize_t UNIFYFS_WRAP(pwrite)(int fd, const void* buf, size_t count, } /* return number of bytes written */ + errno = 0; return (ssize_t)bytes; } else { MAP_OR_FAIL(pwrite); @@ -1861,6 +1889,7 @@ int UNIFYFS_WRAP(fchdir)(int fd) free(posix_client->cwd); } posix_client->cwd = strdup(path); + errno = 0; return 0; } else { MAP_OR_FAIL(fchdir); @@ -1924,6 +1953,7 @@ int UNIFYFS_WRAP(ftruncate)(int fd, off_t length) return -1; } + errno = 0; return 0; } else { MAP_OR_FAIL(ftruncate); @@ -1959,6 +1989,8 @@ int UNIFYFS_WRAP(fsync)(int fd) return -1; } } + + errno = 0; return 0; } else { MAP_OR_FAIL(fsync); @@ -1990,6 +2022,7 @@ int UNIFYFS_WRAP(flock)(int fd, int operation) /* check whether we should intercept this file descriptor */ if (unifyfs_intercept_fd(&fd)) { // KMM I removed the locking code because it was causing hangs + errno = 0; return 0; } else { MAP_OR_FAIL(flock); @@ -2157,6 +2190,7 @@ int UNIFYFS_WRAP(close)(int fd) /* add file descriptor back to free stack */ unifyfs_stack_push(posix_fd_stack, fd); + errno = 0; return 0; } else { MAP_OR_FAIL(close); @@ -2239,6 +2273,7 @@ static int __chmod(int fid, mode_t mode) /* update global size of file from global metadata */ unifyfs_fid_update_file_meta(posix_client, fid, &attr); + errno = 0; return 0; } diff --git a/client/src/unifyfs_api.h b/client/src/unifyfs_api.h index 45d955218..5cb8c654d 100644 --- a/client/src/unifyfs_api.h +++ b/client/src/unifyfs_api.h @@ -48,6 +48,14 @@ typedef uint32_t unifyfs_gfid; /* a valid gfid generated via MD5 hash will never be zero */ #define UNIFYFS_INVALID_GFID ((unifyfs_gfid)0) +/* enumeration of request states */ +typedef enum unifyfs_req_state { + UNIFYFS_REQ_STATE_INVALID = 0, + UNIFYFS_REQ_STATE_IN_PROGRESS, + UNIFYFS_REQ_STATE_CANCELED, + UNIFYFS_REQ_STATE_COMPLETED +} unifyfs_req_state; + /* enumeration of supported I/O request operations */ typedef enum unifyfs_ioreq_op { UNIFYFS_IOREQ_NOP = 0, @@ -59,14 +67,6 @@ typedef enum unifyfs_ioreq_op { UNIFYFS_IOREQ_OP_ZERO, } unifyfs_ioreq_op; 
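As an illustrative aside (not part of this patch), the renamed request-state enum and the new transfer result fields introduced in this header change can be consumed by an application roughly as follows once a transfer request has completed. The header name and the assumption that the library has already dispatched and completed the request are this sketch's assumptions; only fields visible in the patch (state, result.error, result.file_size_bytes, result.transfer_time_seconds) are used.

    #include <stdio.h>
    #include "unifyfs_api.h"  /* assumed header name for the library API */

    /* Illustrative only: report size, time, and throughput for a transfer
     * request that the library API has already dispatched and completed. */
    static void report_transfer(const unifyfs_transfer_request* req)
    {
        if (req->state != UNIFYFS_REQ_STATE_COMPLETED) {
            printf("transfer still in progress or canceled\n");
            return;
        }
        if (req->result.error != 0) {
            printf("transfer failed: error=%d\n", req->result.error);
            return;
        }
        double mib = (double)req->result.file_size_bytes / (1024.0 * 1024.0);
        double sec = req->result.transfer_time_seconds;
        printf("transferred %.2f MiB in %.3f s (%.2f MiB/s)\n",
               mib, sec, (sec > 0.0) ? (mib / sec) : 0.0);
    }

Reporting size and elapsed time in the completion callback (rather than only an error code) is what allows tools such as unifyfs-stage to print per-file transfer statistics without re-stat'ing the destination file.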
-/* enumeration of I/O request states */ -typedef enum unifyfs_ioreq_state { - UNIFYFS_IOREQ_STATE_INVALID = 0, - UNIFYFS_IOREQ_STATE_IN_PROGRESS, - UNIFYFS_IOREQ_STATE_CANCELED, - UNIFYFS_IOREQ_STATE_COMPLETED -} unifyfs_ioreq_state; - /* structure to hold I/O request result values */ typedef struct unifyfs_ioreq_result { int error; @@ -90,7 +90,7 @@ typedef struct unifyfs_io_request { */ /* status/result fields */ - unifyfs_ioreq_state state; + unifyfs_req_state state; unifyfs_ioreq_result result; /* internal fields */ @@ -104,6 +104,14 @@ typedef enum unifyfs_transfer_mode { UNIFYFS_TRANSFER_MODE_MOVE // copy, then remove source } unifyfs_transfer_mode; +/* structure to hold transfer request result values */ +typedef struct unifyfs_transfer_result { + int error; + int rc; + size_t file_size_bytes; + double transfer_time_seconds; +} unifyfs_transfer_result; + /* File transfer request structure */ typedef struct unifyfs_transfer_request { /* user-specified fields */ @@ -119,21 +127,21 @@ typedef struct unifyfs_transfer_request { */ /* status/result fields */ - unifyfs_ioreq_state state; - unifyfs_ioreq_result result; + unifyfs_req_state state; + unifyfs_transfer_result result; /* internal fields */ int _reqid; } unifyfs_transfer_request; /* Global file status struct */ -typedef struct unifyfs_status { +typedef struct unifyfs_file_status { int laminated; int mode; off_t local_file_size; off_t global_file_size; size_t local_write_nbytes; -} unifyfs_status; +} unifyfs_file_status; /* @@ -219,7 +227,7 @@ unifyfs_rc unifyfs_open(unifyfs_handle fshdl, */ unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, const unifyfs_gfid gfid, - unifyfs_status* st); + unifyfs_file_status* st); /* * Synchronize client writes with global metadata. After successful @@ -349,6 +357,11 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, const int waitall); +/* check if client mountpoint is prefix of given filepath */ +bool is_unifyfs_path(unifyfs_handle fshdl, + const char* filepath); + + #ifdef __cplusplus } // extern "C" #endif diff --git a/client/src/unifyfs_api_file.c b/client/src/unifyfs_api_file.c index e9bdadf07..bfb952c0d 100644 --- a/client/src/unifyfs_api_file.c +++ b/client/src/unifyfs_api_file.c @@ -21,9 +21,14 @@ * Internal Methods */ -bool is_unifyfs_path(unifyfs_client* client, +bool is_unifyfs_path(unifyfs_handle fshdl, const char* filepath) { + if ((UNIFYFS_INVALID_HANDLE == fshdl) || (NULL == filepath)) { + return false; + } + unifyfs_client* client = fshdl; + /* the library API expects absolute paths without relative components, * so we don't do any path normalization here */ @@ -143,7 +148,7 @@ unifyfs_rc unifyfs_sync(unifyfs_handle fshdl, /* Get global file status */ unifyfs_rc unifyfs_stat(unifyfs_handle fshdl, const unifyfs_gfid gfid, - unifyfs_status* st) + unifyfs_file_status* st) { if ((UNIFYFS_INVALID_HANDLE == fshdl) || (UNIFYFS_INVALID_GFID == gfid) diff --git a/client/src/unifyfs_api_internal.h b/client/src/unifyfs_api_internal.h index 3431be430..35a4ea4cf 100644 --- a/client/src/unifyfs_api_internal.h +++ b/client/src/unifyfs_api_internal.h @@ -128,8 +128,4 @@ int unifyfs_sync_files(unifyfs_client* client); off_t unifyfs_gfid_filesize(unifyfs_client* client, int gfid); -/* check if client mountpoint is prefix of given filepath */ -bool is_unifyfs_path(unifyfs_client* client, - const char* filepath); - #endif // UNIFYFS_API_INTERNAL_H diff --git a/client/src/unifyfs_api_io.c b/client/src/unifyfs_api_io.c index 37ed288b6..ab41ca6f0 100644 --- a/client/src/unifyfs_api_io.c +++ 
b/client/src/unifyfs_api_io.c @@ -34,7 +34,7 @@ static int process_gfid_writes(unifyfs_client* client, int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; req->result.error = EINVAL; continue; } @@ -54,7 +54,7 @@ static int process_gfid_writes(unifyfs_client* client, if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; } - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; if (req->op == UNIFYFS_IOREQ_OP_ZERO) { /* cleanup allocated OP_ZERO buffer */ @@ -78,7 +78,7 @@ static int process_gfid_truncates(unifyfs_client* client, int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; req->result.error = EINVAL; } @@ -86,7 +86,7 @@ static int process_gfid_truncates(unifyfs_client* client, if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; } - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; } return ret; @@ -106,7 +106,7 @@ static int process_gfid_syncs(unifyfs_client* client, if (req->op == UNIFYFS_IOREQ_OP_SYNC_META) { int fid = unifyfs_fid_from_gfid(client, req->gfid); if (-1 == fid) { - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; req->result.error = EINVAL; } @@ -114,7 +114,7 @@ static int process_gfid_syncs(unifyfs_client* client, if (rc != UNIFYFS_SUCCESS) { req->result.error = rc; } - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; } else if (req->op == UNIFYFS_IOREQ_OP_SYNC_DATA) { /* logio_sync covers all files' data - only do it once */ if (!data_sync_completed) { @@ -125,7 +125,7 @@ static int process_gfid_syncs(unifyfs_client* client, data_sync_completed = 1; } } - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; } } @@ -165,7 +165,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, req = reqs + i; /* set initial request result and state */ - req->state = UNIFYFS_IOREQ_STATE_INVALID; + req->state = UNIFYFS_REQ_STATE_INVALID; req->result.error = UNIFYFS_SUCCESS; req->result.count = 0; req->result.rc = 0; @@ -234,7 +234,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, size_t s_ndx = 0; for (i = 0; i < nreqs; i++) { req = reqs + i; - req->state = UNIFYFS_IOREQ_STATE_IN_PROGRESS; + req->state = UNIFYFS_REQ_STATE_IN_PROGRESS; switch (req->op) { case UNIFYFS_IOREQ_NOP: break; @@ -319,7 +319,7 @@ unifyfs_rc unifyfs_dispatch_io(unifyfs_handle fshdl, s_ndx = 0; for (i = 0; i < nreqs; i++) { req = reqs + i; - req->state = UNIFYFS_IOREQ_STATE_COMPLETED; + req->state = UNIFYFS_REQ_STATE_COMPLETED; switch (req->op) { case UNIFYFS_IOREQ_NOP: break; @@ -397,8 +397,8 @@ unifyfs_rc unifyfs_wait_io(unifyfs_handle fshdl, n_done = 0; for (i = 0; i < nreqs; i++) { unifyfs_io_request* req = reqs + i; - if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || - (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + if ((req->state == UNIFYFS_REQ_STATE_CANCELED) || + (req->state == UNIFYFS_REQ_STATE_COMPLETED)) { n_done++; } } diff --git a/client/src/unifyfs_api_transfer.c b/client/src/unifyfs_api_transfer.c index 15c4b82ff..4c00de74b 100644 --- a/client/src/unifyfs_api_transfer.c +++ b/client/src/unifyfs_api_transfer.c @@ -58,8 +58,8 @@ unifyfs_rc unifyfs_cancel_transfer(unifyfs_handle fshdl, for (size_t i = 0; i < nreqs; i++) { unifyfs_transfer_request* req = reqs + i; - if (req->state != 
UNIFYFS_IOREQ_STATE_COMPLETED) { - req->state = UNIFYFS_IOREQ_STATE_CANCELED; + if (req->state != UNIFYFS_REQ_STATE_COMPLETED) { + req->state = UNIFYFS_REQ_STATE_CANCELED; /* TODO: cancel the transfer */ } @@ -89,7 +89,7 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, unifyfs_transfer_request* req; client_transfer_status* transfer; size_t i, n_done; - int max_loop = 30000; + int max_loop = 6000; int loop_cnt = 0; do { n_done = 0; @@ -101,8 +101,8 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, LOGDBG("checked - complete"); n_done++; client_cleanup_transfer(client, transfer); - } else if ((req->state == UNIFYFS_IOREQ_STATE_CANCELED) || - (req->state == UNIFYFS_IOREQ_STATE_COMPLETED)) { + } else if ((req->state == UNIFYFS_REQ_STATE_CANCELED) || + (req->state == UNIFYFS_REQ_STATE_COMPLETED)) { /* this handles the case where we have already cleaned the * transfer status in a prior loop iteration */ n_done++; @@ -122,9 +122,9 @@ unifyfs_rc unifyfs_wait_transfer(unifyfs_handle fshdl, /* TODO: we probably need a timeout mechanism to prevent an infinite * loop when something goes wrong and the transfer status never * gets updated. For now, just using a hardcoded maximum loop - * iteration count that roughly equates to 30 sec */ + * iteration count that roughly equates to 10 min (6000 sec) */ loop_cnt++; - usleep(1000); /* sleep 1 ms */ + usleep(100000); /* sleep 100 ms */ } while (loop_cnt < max_loop); if (loop_cnt == max_loop) { diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index b4dc7bf2f..e7899c4d3 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -449,7 +449,7 @@ int unifyfs_fid_open( LOGERR("failed to allocate storage space for file %s (fid=%d)", path, fid); unifyfs_fid_delete(client, fid); - return ret; + return ENOMEM; } /* TODO: set meta->mode bits to mode variable */ diff --git a/common/src/unifyfs_client_rpcs.h b/common/src/unifyfs_client_rpcs.h index 4cf731749..31240f875 100644 --- a/common/src/unifyfs_client_rpcs.h +++ b/common/src/unifyfs_client_rpcs.h @@ -167,6 +167,9 @@ MERCURY_GEN_PROC(unifyfs_transfer_complete_in_t, ((int32_t)(app_id)) ((int32_t)(client_id)) ((int32_t)(transfer_id)) + ((hg_size_t)(transfer_size_bytes)) + ((uint32_t)(transfer_time_sec)) + ((uint32_t)(transfer_time_usec)) ((int32_t)(error_code))) MERCURY_GEN_PROC(unifyfs_transfer_complete_out_t, ((int32_t)(ret))) DECLARE_MARGO_RPC_HANDLER(unifyfs_transfer_complete_rpc) diff --git a/common/src/unifyfs_misc.c b/common/src/unifyfs_misc.c index 345daa172..e7375f6be 100644 --- a/common/src/unifyfs_misc.c +++ b/common/src/unifyfs_misc.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Copyright (c) 2022, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2020, UT-Battelle, LLC. + * Copyright 2022, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -20,8 +20,10 @@ #include #include +#include "unifyfs_misc.h" + /* - * Re-implementation of BSD's strlcpy() function. + * Implementation of BSD's strlcpy() function. * * This is a basically a safer version of strlncpy() since it always * NULL-terminates the buffer. Google 'strlcpy' for full documentation. @@ -43,24 +45,37 @@ size_t strlcpy(char* dest, const char* src, size_t size) } /* - * This is a re-implementation of the Linux kernel's scnprintf() function. + * Implementation of the Linux kernel's scnprintf() function. 
* * It's snprintf() but returns the number of chars actually written into buf[] * not including the '\0'. It also avoids the -Wformat-truncation warnings. */ int scnprintf(char* buf, size_t size, const char* fmt, ...) { - va_list args; - int rc; + va_list args; + int rc; + + va_start(args, fmt); + rc = vsnprintf(buf, size, fmt, args); + va_end(args); - va_start(args, fmt); - rc = vsnprintf(buf, size, fmt, args); - va_end(args); + if (rc >= size) { + /* We truncated */ + return size - 1; + } - if (rc >= size) { - /* We truncated */ - return size - 1; - } + return rc; +} - return rc; + +/* Calculate timestamp difference in seconds */ +double timediff_sec(struct timeval* before, struct timeval* after) +{ + double diff; + if (!before || !after) { + return -1.0F; + } + diff = (double)(after->tv_sec - before->tv_sec); + diff += 0.000001 * ((double)(after->tv_usec) - (double)(before->tv_usec)); + return diff; } diff --git a/common/src/unifyfs_misc.h b/common/src/unifyfs_misc.h index 3678c2ec6..086139c65 100644 --- a/common/src/unifyfs_misc.h +++ b/common/src/unifyfs_misc.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Copyright (c) 2022, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2020, UT-Battelle, LLC. + * Copyright 2022, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -12,11 +12,20 @@ * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -#ifndef __UNIFYFS_MISC__ -#define __UNIFYFS_MISC__ +#ifndef UNIFYFS_MISC_H +#define UNIFYFS_MISC_H +#include + +/* Implementation of BSD's strlcpy() function. */ size_t strlcpy(char* dest, const char* src, size_t size); +/* Version of snprintf() that returns the actual number of characters + * written into the buffer, not including the trailing NUL character */ int scnprintf(char* buf, size_t size, const char* fmt, ...); +/* Calculate timestamp difference in seconds */ +double timediff_sec(struct timeval* before, struct timeval* after); + + #endif diff --git a/docs/library_api.rst b/docs/library_api.rst index 7bb0af843..872084754 100644 --- a/docs/library_api.rst +++ b/docs/library_api.rst @@ -77,7 +77,7 @@ I/O requests take the form of a ``unifyfs_io_request`` structure that includes the target file gfid, the specific I/O operation (``unifyfs_ioreq_op``) to be applied, and associated operation parameters such as the file offset or user buffer and size. The structure also contains fields used for tracking the -status of the request (``unifyfs_ioreq_state``) and operation results +status of the request (``unifyfs_req_state``) and operation results (``unifyfs_ioreq_result``). .. 
code-block:: C @@ -93,11 +93,11 @@ status of the request (``unifyfs_ioreq_state``) and operation results unifyfs_ioreq_op op; /* status/result fields */ - unifyfs_ioreq_state state; + unifyfs_req_state state; unifyfs_ioreq_result result; } unifyfs_io_request; - /* enumeration of supported I/O request operations */ + /* Enumeration of supported I/O request operations */ typedef enum unifyfs_ioreq_op { UNIFYFS_IOREQ_NOP = 0, UNIFYFS_IOREQ_OP_READ, @@ -108,15 +108,15 @@ status of the request (``unifyfs_ioreq_state``) and operation results UNIFYFS_IOREQ_OP_ZERO, } unifyfs_ioreq_op; - /* enumeration of I/O request states */ - typedef enum unifyfs_ioreq_state { - UNIFYFS_IOREQ_STATE_INVALID = 0, - UNIFYFS_IOREQ_STATE_IN_PROGRESS, - UNIFYFS_IOREQ_STATE_CANCELED, - UNIFYFS_IOREQ_STATE_COMPLETED - } unifyfs_ioreq_state; + /* Enumeration of API request states */ + typedef enum unifyfs_req_state { + UNIFYFS_REQ_STATE_INVALID = 0, + UNIFYFS_REQ_STATE_IN_PROGRESS, + UNIFYFS_REQ_STATE_CANCELED, + UNIFYFS_REQ_STATE_COMPLETED + } unifyfs_req_state; - /* structure to hold I/O request result values */ + /* Structure containing I/O request result values */ typedef struct unifyfs_ioreq_result { int error; int rc; @@ -127,7 +127,7 @@ For the ``unifyfs_ioreq_result`` structure, successful operations will set the ``rc`` and ``count`` fields as applicable to the specific operation type. All operational failures are reported by setting the ``error`` field to a non-zero value corresponding the the operation failure code, which is often a POSIX -errno value. +``errno`` value. File transfer requests use a ``unifyfs_transfer_request`` structure that includes the source and destination file paths, transfer mode, and a flag @@ -147,17 +147,25 @@ status and transfer operation result. int use_parallel; /* status/result fields */ - unifyfs_ioreq_state state; - unifyfs_ioreq_result result; + unifyfs_req_state state; + unifyfs_transfer_result result; } unifyfs_transfer_request; - /* enumeration of supported I/O request operations */ + /* Enumeration of supported I/O request operations */ typedef enum unifyfs_transfer_mode { UNIFYFS_TRANSFER_MODE_INVALID = 0, UNIFYFS_TRANSFER_MODE_COPY, // simple copy to destination UNIFYFS_TRANSFER_MODE_MOVE // copy, then remove source } unifyfs_transfer_mode; + /* File transfer result structure */ + typedef struct unifyfs_transfer_result { + int error; + int rc; + size_t file_size_bytes; + double transfer_time_seconds; + } unifyfs_transfer_result; + ------------------------- Example Library API Usage ------------------------- diff --git a/docs/run.rst b/docs/run.rst index 06ad139bd..dcf7ee220 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -137,23 +137,23 @@ The transfer subsystem within UnifyFS can be invoked by providing the $ unifyfs start --stage-in=/path/to/input/manifest/file --share-dir=/path/to/shared/file/system -and/or by providing the ``-o|--stage-out``, and consequently required -``-S|--share-dir``, option to ``unifyfs terminate`` to transfer files out of -UnifyFS: +and/or by providing the ``-o|--stage-out`` option to ``unifyfs terminate`` +to transfer files out of UnifyFS: .. code-block:: Bash $ unifyfs terminate --stage-out=/path/to/output/manifest/file --share-dir=/path/to/shared/file/system -A manifest file needs to be provided to the ``start``/``terminate`` commands in -order to specify the desired transfers. +The argument to both staging options is the path to a manifest file that contains +the source and destination file pairs. 
Both stage-in and stage-out also require +passing the ``-S|--share-dir=`` option. .. _manifest_file_label: Manifest File ^^^^^^^^^^^^^ -UnifyFS's stage functionality requires a manifest file in order to move data. +UnifyFS's file staging functionality requires a manifest file in order to move data. The manifest file contains one or more file copy requests. Each line in the manifest corresponds to one transfer request, and it contains both the source @@ -166,10 +166,11 @@ If either of the filenames contain whitespace or special characters, then both filenames should be surrounded by double-quote characters (") (ASCII character 34 decimal). The double-quote and linefeed end-of-line characters are not supported in any -filenames used in a unifyfs manifest file. Any other characters are allowed, +filenames used in a manifest file. Any other characters are allowed, including control characters. If a filename contains any characters that might -be misinterpreted, then enclosing the filename in double-quotes is always a safe -thing to do. +be misinterpreted, we suggest enclosing the filename in double-quotes. +Comment lines are also allowed, and are indicated by beginning a line with the +``#`` character. Here is an example of a valid stage-in manifest file: @@ -178,28 +179,27 @@ Here is an example of a valid stage-in manifest file: $ [prompt] cat example_stage_in.manifest /scratch/users/me/input_data/input_1.dat /unifyfs/input/input_1.dat + # example comment line /home/users/me/configuration/run_12345.conf /unifyfs/config/run_12345.conf "/home/users/me/file with space.dat" "/unifyfs/file with space.dat" Transfer During Job ******************* -Data can also be transferred in/out of UnifyFS using one of two stand-alone -applications. +Data can also be transferred in/out of UnifyFS using the ``unifyfs-stage`` +helper program. This is the same program used internally by ``unifyfs`` to +provide file staging during server startup and termination. -The stand-alone applications can be invoked at any time while the UnifyFS -servers are up and responding to requests. This allows for bringing in new input +The helper program can be invoked at any time while the UnifyFS servers +are up and responding to requests. This allows for bringing in new input and/or transferring results out to be verified before the job terminates. UnifyFS Stage Executable ^^^^^^^^^^^^^^^^^^^^^^^^ -The ``start``/``terminate`` transfer API stage functionality can also be used -via the stand-alone application ``unifyfs-stage``. This application is installed -in the same directory as the ``unifyfs`` utility (``$UNIFYFS_INSTALL/bin``). +The ``unifyfs-stage`` program is installed in the same directory as the +``unifyfs`` utility (i.e., ``$UNIFYFS_INSTALL/bin``). -This application can be run at any time within a job to transfer new data into -or results out of UnifyFS. A manifest file (see :ref:`above `) needs to be provided as an argument to use this approach. @@ -219,76 +219,37 @@ as an argument to use this approach. "/source/file/path" "/destination/file/path" - One file per line; Specifying directories is not supported. + One file per line; Specifying directories is not currently supported. 
Available options: - -c, --checksum verify md5 checksum for each transfer - -h, --help print this usage - -m, --mountpoint= use as unifyfs mountpoint + -c, --checksum Verify md5 checksum for each transfer + (default: off) + -h, --help Print usage information + -m, --mountpoint= Use as UnifyFS mountpoint (default: /unifyfs) - -p, --parallel transfer each file in parallel - (experimental) - -s, --share-dir= directory path for creating status file - -v, --verbose print noisy outputs - - Without the '-p, --parallel' option, a file is transferred by a single - process. If the '-p, --parallel' option is specified, each file will be - divided by multiple processes and transferred in parallel. + -p, --parallel Transfer all files concurrently + (default: off, use sequential transfers) + -s, --skewed Use skewed data distribution for stage-in + (default: off, use balanced distribution) + -S, --status-file= Create stage status file at + -v, --verbose Print verbose information + (default: off) + + By default, each file in the manifest will be transferred in sequence (i.e., + only a single file will be in transfer at any given time). If the + '-p, --parallel' option is specified, files in the manifest will be + transferred concurrently. The number of concurrent transfers is limited by + the number of parallel ranks used to execute unifyfs-stage. Examples: .. code-block:: Bash - :caption: Serial Transfer + :caption: Sequential Transfer using a Single Client $ srun -N 1 -n 1 unifyfs-stage $MY_MANIFEST_FILE .. code-block:: Bash - :caption: Parallel Transfer + :caption: Parallel Transfer using 8 Clients (up to 8 concurrent file transfers) $ srun -N 4 -n 8 unifyfs-stage --parallel $MY_MANIFEST_FILE -Transfer Executable -^^^^^^^^^^^^^^^^^^^ - -``$UNIFYFS_INSTALL/libexec/transfer-static`` - -.. note:: - - The ``transfer-gotcha`` executable is currently unusable due to an issue - that is being tracked. - -The transfer API can also be used during the job by invoking the stand-alone -``transfer`` application. It works similarly to the Unix ``cp`` command, with -source and destination, except being aware that it is copying files between an -external file system and internal UnifyFS. - -.. code-block:: Bash - - [prompt]$ transfer-static --help - - Usage: transfer-static [options...] - - Available options: - -d, --debug pause before running test - (handy for attaching in debugger) - -h, --help help message - -m, --mount= use for unifyfs - (default: /unifyfs) - -p, --parallel parallel transfer - -r, --rank= use for transfer (default: 0) - -Examples of using ``transfer-static``: - -.. code-block:: Bash - :caption: Serial Transfer - - $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/transfer-static /path/on/parallelfs/file.dat /unifyfs/file.dat - - $ srun -N 1 -n 1 $UNIFYFS_INSTALL/libexec/transfer-static /unifyfs/output.dat /scratch/my_output/output.dat - -.. 
code-block:: Bash - :caption: Parallel Transfer - - $ srun -N 4 -n 8 /path/to/libexec/transfer-static --parallel /path/on/parallelfs/file.dat /unifyfs/file.dat - - $ srun -N 4 -n 8 /path/to/libexec/transfer-static --parallel /unifyfs/output.dat /scratch/my_output/output.dat diff --git a/examples/src/testutil_rdwr.h b/examples/src/testutil_rdwr.h index cbea8c766..494796143 100644 --- a/examples/src/testutil_rdwr.h +++ b/examples/src/testutil_rdwr.h @@ -403,7 +403,7 @@ int stat_file(test_cfg* cfg, const char* filepath) #ifdef DISABLE_UNIFYFS return ENOTSUP; #else - unifyfs_status us; + unifyfs_file_status us; unifyfs_rc urc = unifyfs_stat(cfg->fshdl, cfg->gfid, &us); if (UNIFYFS_SUCCESS != urc) { test_print(cfg, "unifyfs_stat(%s, gfid=%d) failed - %s", diff --git a/server/src/margo_server.c b/server/src/margo_server.c index 23d9abff3..dd435798a 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -846,6 +846,9 @@ int invoke_client_mread_req_complete_rpc(int app_id, int invoke_client_transfer_complete_rpc(int app_id, int client_id, int transfer_id, + size_t transfer_sz_bytes, + int transfer_time_sec, + int transfer_time_usec, int error_code) { hg_return_t hret; @@ -857,10 +860,13 @@ int invoke_client_transfer_complete_rpc(int app_id, /* fill input struct */ unifyfs_transfer_complete_in_t in; - in.app_id = (int32_t) app_id; - in.client_id = (int32_t) client_id; - in.transfer_id = (int32_t) transfer_id; - in.error_code = (int32_t) error_code; + in.app_id = (int32_t) app_id; + in.client_id = (int32_t) client_id; + in.transfer_id = (int32_t) transfer_id; + in.transfer_size_bytes = (hg_size_t) transfer_sz_bytes; + in.transfer_time_sec = (uint32_t) transfer_time_sec; + in.transfer_time_usec = (uint32_t) transfer_time_usec; + in.error_code = (int32_t) error_code; /* get handle to rpc function */ hg_id_t rpc_id = unifyfsd_rpc_context->rpcs.client_transfer_complete_id; diff --git a/server/src/margo_server.h b/server/src/margo_server.h index ecff8bafb..36ac113fe 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -109,6 +109,9 @@ int invoke_client_mread_req_complete_rpc(int app_id, int invoke_client_transfer_complete_rpc(int app_id, int client_id, int transfer_id, + size_t transfer_sz_bytes, + int transfer_time_sec, + int transfer_time_usec, int error_code); /* invokes the client unlink callback rpc function */ diff --git a/server/src/unifyfs_fops_rpc.c b/server/src/unifyfs_fops_rpc.c index bbd555112..0cbe37f0c 100644 --- a/server/src/unifyfs_fops_rpc.c +++ b/server/src/unifyfs_fops_rpc.c @@ -136,11 +136,11 @@ int rpc_transfer(unifyfs_fops_ctx_t* ctx, int transfer_mode, const char* dest_file) { - if (TRANSFER_MODE_OWNER == transfer_mode) { + if (SERVER_TRANSFER_MODE_OWNER == transfer_mode) { return unifyfs_invoke_transfer_rpc(ctx->app_id, ctx->client_id, transfer_id, gfid, transfer_mode, dest_file); - } else if (TRANSFER_MODE_LOCAL == transfer_mode) { + } else if (SERVER_TRANSFER_MODE_LOCAL == transfer_mode) { return unifyfs_invoke_broadcast_transfer(ctx->app_id, ctx->client_id, transfer_id, gfid, transfer_mode, dest_file); diff --git a/server/src/unifyfs_global.h b/server/src/unifyfs_global.h index 9bb4d9c83..1f56705bb 100644 --- a/server/src/unifyfs_global.h +++ b/server/src/unifyfs_global.h @@ -57,6 +57,12 @@ #include "unifyfs_server_rpcs.h" +/* server transfer modes */ +typedef enum { + SERVER_TRANSFER_MODE_OWNER = 0, /* owner transfers all data */ + SERVER_TRANSFER_MODE_LOCAL = 1 /* each server transfers local data */ +} transfer_mode_e; + /* Some global 
variables/structures used throughout the server code */ /* PMI server rank and server count */ diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index 997ceade4..e839805d7 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -13,6 +13,7 @@ */ #include "unifyfs_group_rpc.h" +#include "unifyfs_p2p_rpc.h" #ifndef UNIFYFS_BCAST_K_ARY @@ -147,6 +148,7 @@ static int get_child_response(coll_request* coll_req, margo_free_output(chdl, out); } + free(out); } return ret; @@ -229,20 +231,29 @@ static coll_request* collective_create(server_rpc_e req_type, { coll_request* coll_req = calloc(1, sizeof(*coll_req)); if (NULL != coll_req) { - LOGDBG("BCAST_RPC: collective(%p) create (type=%d)", - coll_req, req_type); - coll_req->req_type = req_type; - coll_req->resp_hdl = handle; - coll_req->input = input_struct; - coll_req->output = output_struct; - coll_req->output_sz = output_size; - coll_req->bulk_in = bulk_in; - coll_req->bulk_forward = bulk_forward; - coll_req->bulk_buf = bulk_buf; - - unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, tree_root_rank, - UNIFYFS_BCAST_K_ARY, &(coll_req->tree)); - + LOGDBG("BCAST_RPC: collective(%p) create (type=%d, root=%d)", + coll_req, req_type, tree_root_rank); + coll_req->resp_hdl = handle; + coll_req->req_type = req_type; + coll_req->output = output_struct; + coll_req->input = input_struct; + coll_req->bulk_in = bulk_in; + coll_req->output_sz = output_size; + coll_req->bulk_buf = bulk_buf; + coll_req->bulk_forward = bulk_forward; + coll_req->progress_req = MARGO_REQUEST_NULL; + coll_req->progress_hdl = HG_HANDLE_NULL; + coll_req->app_id = -1; + coll_req->client_id = -1; + coll_req->client_req_id = -1; + + int rc = unifyfs_tree_init(glb_pmi_rank, glb_pmi_size, tree_root_rank, + UNIFYFS_BCAST_K_ARY, &(coll_req->tree)); + if (rc) { + LOGERR("unifyfs_tree_init() failed"); + free(coll_req); + return NULL; + } size_t n_children = (size_t) coll_req->tree.child_count; if (n_children) { coll_req->child_hdls = calloc(n_children, sizeof(hg_handle_t)); @@ -257,6 +268,8 @@ static coll_request* collective_create(server_rpc_e req_type, int* ranks = coll_req->tree.child_ranks; for (int i = 0; i < coll_req->tree.child_count; i++) { /* allocate child request handle */ + LOGDBG("collective(%p) - child[%d] is rank=%d", + coll_req, i, ranks[i]); hg_handle_t* chdl = coll_req->child_hdls + i; int rc = get_child_request_handle(op_hgid, ranks[i], chdl); if (rc != UNIFYFS_SUCCESS) { @@ -296,7 +309,7 @@ static void coll_restore_input_bulk(coll_request* coll_req) } } -static void collective_cleanup(coll_request* coll_req) +void collective_cleanup(coll_request* coll_req) { if (NULL == coll_req) { return; @@ -304,10 +317,14 @@ static void collective_cleanup(coll_request* coll_req) LOGDBG("BCAST_RPC: collective(%p) cleanup", coll_req); - /* release communication tree resources */ - unifyfs_tree_free(&(coll_req->tree)); - /* release margo resources */ + if (HG_HANDLE_NULL != coll_req->progress_hdl) { + if (MARGO_REQUEST_NULL != coll_req->progress_req) { + margo_wait(coll_req->progress_req); + } + margo_destroy(coll_req->progress_hdl); + } + if (HG_HANDLE_NULL != coll_req->resp_hdl) { if (NULL != coll_req->input) { coll_restore_input_bulk(coll_req); @@ -315,6 +332,7 @@ static void collective_cleanup(coll_request* coll_req) } margo_destroy(coll_req->resp_hdl); } + if (HG_BULK_NULL != coll_req->bulk_forward) { margo_bulk_free(coll_req->bulk_forward); } @@ -335,6 +353,10 @@ static void collective_cleanup(coll_request* coll_req) if (NULL != 
coll_req->bulk_buf) { free(coll_req->bulk_buf); } + + /* release communication tree resources */ + unifyfs_tree_free(&(coll_req->tree)); + memset(coll_req, 0, sizeof(*coll_req)); free(coll_req); } @@ -365,8 +387,6 @@ static int collective_forward(coll_request* coll_req) return ret; } - - /* set collective output return value to local result value */ void collective_set_local_retval(coll_request* coll_req, int val) { @@ -413,8 +433,9 @@ void collective_set_local_retval(coll_request* coll_req, int val) } } -/* Forward the collective request to any children */ -static int collective_finish(coll_request* coll_req) +/* finish collective process by waiting for any child responses and + * sending parent response (if applicable) */ +int collective_finish(coll_request* coll_req) { int ret = UNIFYFS_SUCCESS; @@ -438,8 +459,6 @@ static int collective_finish(coll_request* coll_req) coll_req, (int)(coll_req->req_type)); } - collective_cleanup(coll_req); - return ret; } @@ -460,10 +479,9 @@ int invoke_bcast_progress_rpc(coll_request* coll_req) } /* get handle to local rpc function */ - hg_handle_t handle; hg_id_t hgid = unifyfsd_rpc_context->rpcs.bcast_progress_id; hg_return_t hret = margo_create(unifyfsd_rpc_context->svr_mid, addr, - hgid, &handle); + hgid, &(coll_req->progress_hdl)); if (hret != HG_SUCCESS) { LOGERR("failed to get handle for bcast progress - %s", HG_Error_to_string(hret)); @@ -473,7 +491,8 @@ int invoke_bcast_progress_rpc(coll_request* coll_req) * by a ULT */ bcast_progress_in_t in; in.coll_req = (hg_ptr_t) coll_req; - hret = margo_forward(handle, &in); + hret = margo_iforward(coll_req->progress_hdl, &in, + &(coll_req->progress_req)); if (hret != HG_SUCCESS) { LOGERR("failed to forward bcast progress for coll(%p) - %s", HG_Error_to_string(hret), coll_req); @@ -489,6 +508,7 @@ static void bcast_progress_rpc(hg_handle_t handle) { /* assume we'll succeed */ int32_t ret = UNIFYFS_SUCCESS; + coll_request* coll = NULL; bcast_progress_in_t in; hg_return_t hret = margo_get_input(handle, &in); @@ -497,7 +517,7 @@ static void bcast_progress_rpc(hg_handle_t handle) ret = UNIFYFS_ERROR_MARGO; } else { /* call collective_finish() to progress bcast operation */ - coll_request* coll = (coll_request*) in.coll_req; + coll = (coll_request*) in.coll_req; LOGDBG("BCAST_RPC: bcast progress collective(%p)", coll); ret = collective_finish(coll); if (ret != UNIFYFS_SUCCESS) { @@ -514,6 +534,10 @@ static void bcast_progress_rpc(hg_handle_t handle) LOGERR("margo_respond() failed - %s", HG_Error_to_string(hret)); } + if (NULL != coll) { + collective_cleanup(coll); + } + /* free margo resources */ margo_free_input(handle, &in); margo_destroy(handle); @@ -917,40 +941,20 @@ int unifyfs_invoke_broadcast_transfer(int client_app, /* assuming success */ int ret = UNIFYFS_SUCCESS; - /* get attributes and extents metadata */ - unifyfs_file_attr_t attrs; - ret = unifyfs_inode_metaget(gfid, &attrs); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("failed to get file attributes for gfid=%d", gfid); - return ret; - } - - if (!attrs.is_shared) { - /* no need to broadcast for private files */ - LOGDBG("gfid=%d is private, not broadcasting", gfid); - return UNIFYFS_SUCCESS; - } - - ret = sm_transfer(glb_pmi_rank, client_app, client_id, transfer_id, gfid, - transfer_mode, dest_file, NULL); - if (ret != UNIFYFS_SUCCESS) { - LOGERR("sm_transfer() at root failed for gfid=%d", gfid); - return ret; - } - LOGDBG("BCAST_RPC: starting transfer(mode=%d) for gfid=%d to file %s", transfer_mode, gfid, dest_file); coll_request* coll = NULL; 
transfer_bcast_in_t* in = calloc(1, sizeof(*in)); - if (NULL == in) { + server_rpc_req_t* req = calloc(1, sizeof(*req)); + if ((NULL == in) || (NULL == req)) { ret = ENOMEM; } else { /* set input params */ in->root = (int32_t) glb_pmi_rank; in->gfid = (int32_t) gfid; in->mode = (int32_t) transfer_mode; - in->dst_file = (hg_const_string_t) dest_file; + in->dst_file = (hg_const_string_t) strdup(dest_file); hg_id_t op_hgid = unifyfsd_rpc_context->rpcs.transfer_bcast_id; server_rpc_e rpc = UNIFYFS_SERVER_BCAST_RPC_TRANSFER; @@ -961,9 +965,21 @@ int unifyfs_invoke_broadcast_transfer(int client_app, if (NULL == coll) { ret = ENOMEM; } else { - ret = collective_forward(coll); - if (ret == UNIFYFS_SUCCESS) { - ret = invoke_bcast_progress_rpc(coll); + int rc = collective_forward(coll); + if (rc == UNIFYFS_SUCCESS) { + coll->app_id = client_app; + coll->client_id = client_id; + coll->client_req_id = transfer_id; + req->req_type = rpc; + req->coll = coll; + req->handle = HG_HANDLE_NULL; + req->input = (void*) in; + req->bulk_buf = NULL; + req->bulk_sz = 0; + ret = sm_submit_service_request(req); + if (ret != UNIFYFS_SUCCESS) { + LOGERR("failed to submit coll request to svcmgr"); + } } } } diff --git a/server/src/unifyfs_group_rpc.h b/server/src/unifyfs_group_rpc.h index 9c23c183f..309ec0971 100644 --- a/server/src/unifyfs_group_rpc.h +++ b/server/src/unifyfs_group_rpc.h @@ -17,16 +17,19 @@ #include "unifyfs_global.h" #include "unifyfs_inode.h" -#include "unifyfs_service_manager.h" #include "unifyfs_tree.h" #include "margo_server.h" /* Collective Server RPCs */ /* server collective (coll) request state structure */ -typedef struct { +typedef struct coll_request { server_rpc_e req_type; + int app_id; + int client_id; + int client_req_id; unifyfs_tree_t tree; + hg_handle_t progress_hdl; hg_handle_t resp_hdl; size_t output_sz; /* size of output struct */ void* output; /* output struct (type is dependent on rpc) */ @@ -34,6 +37,7 @@ typedef struct { void* bulk_buf; /* allocated buffer for bulk data */ hg_bulk_t bulk_in; hg_bulk_t bulk_forward; + margo_request progress_req; margo_request* child_reqs; hg_handle_t* child_hdls; } coll_request; @@ -41,6 +45,13 @@ typedef struct { /* set collective output return value to local result value */ void collective_set_local_retval(coll_request* coll_req, int val); +/* finish collective process by waiting for any child responses and + * sending parent response (if applicable) */ +int collective_finish(coll_request* coll_req); + +/* release resources associated with collective */ +void collective_cleanup(coll_request* coll_req); + /** * @brief Progress an ongoing broadcast tree operation * diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index af075ad0d..41185f399 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -1334,7 +1334,8 @@ static int process_transfer_rpc(reqmgr_thrd_t* reqmgr, assert(in != NULL); int transfer_id = in->transfer_id; int gfid = in->gfid; - int mode = in->mode; + int mode = (in->mode == 1 ? 
SERVER_TRANSFER_MODE_LOCAL + : SERVER_TRANSFER_MODE_OWNER); const char* dest_file = strdup(in->dst_file); margo_free_input(req->handle, in); free(in); @@ -1350,6 +1351,8 @@ static int process_transfer_rpc(reqmgr_thrd_t* reqmgr, LOGERR("unifyfs_fops_transfer() failed"); } + LOGDBG("responding - ret=%d", ret); + /* send rpc response */ unifyfs_transfer_out_t out; out.ret = (int32_t) ret; diff --git a/server/src/unifyfs_service_manager.c b/server/src/unifyfs_service_manager.c index 06073c3ee..a25490e06 100644 --- a/server/src/unifyfs_service_manager.c +++ b/server/src/unifyfs_service_manager.c @@ -551,43 +551,60 @@ int sm_transfer(int client_server, int gfid, int transfer_mode, const char* dest_file, - server_rpc_req_t* bcast_req) + void* bcast_coll) { + int rc; + int ret = UNIFYFS_SUCCESS; + int owner_rank = hash_gfid_to_server(gfid); int is_owner = (owner_rank == glb_pmi_rank); - unifyfs_file_attr_t attrs; - int ret = unifyfs_inode_metaget(gfid, &attrs); - if (ret == UNIFYFS_SUCCESS) { - /* we have local file state */ - LOGDBG("transfer - gfid=%d mode=%d file=%s", - gfid, transfer_mode, dest_file); - transfer_thread_args* tta = calloc(1, sizeof(*tta)); - if (transfer_mode == TRANSFER_MODE_LOCAL) { - /* each server transfers local data to the destination file */ - int rc = create_local_transfers(gfid, dest_file, tta); - if (rc != UNIFYFS_SUCCESS) { - ret = rc; - } else { - /* submit transfer request for processing */ - tta->bcast_req = bcast_req; - tta->client_server = client_server; - tta->client_app = client_app; - tta->client_id = client_id; - tta->transfer_id = transfer_id; - rc = sm_submit_transfer_request(tta); - if (rc != UNIFYFS_SUCCESS) { - ret = rc; - } - } - } else if (is_owner && (transfer_mode == TRANSFER_MODE_OWNER)) { - // TODO: support TRANSFER_MODE_OWNER - ret = UNIFYFS_ERROR_NYI; + transfer_thread_args* tta = calloc(1, sizeof(*tta)); + if (NULL == tta) { + LOGERR("failed to allocate transfer_thread_args for gfid=%d", gfid); + return ENOMEM; + } + tta->dst_file = strdup(dest_file); + tta->gfid = gfid; + tta->bcast_coll = bcast_coll; + tta->client_server = client_server; + tta->client_app = client_app; + tta->client_id = client_id; + tta->transfer_id = transfer_id; + + if (glb_pmi_rank == client_server) { + unifyfs_file_attr_t attrs; + rc = unifyfs_invoke_metaget_rpc(gfid, &attrs); + if (rc == UNIFYFS_SUCCESS) { + tta->file_sz = (size_t) attrs.size; } - if (ret != UNIFYFS_SUCCESS) { - LOGERR("transfer(gfid=%d, mode=%d, file=%s) failed", - gfid, transfer_mode, dest_file); + } + + LOGDBG("transfer - gfid=%d mode=%d file=%s", + gfid, transfer_mode, dest_file); + + if (transfer_mode == SERVER_TRANSFER_MODE_LOCAL) { + /* each server transfers local data to the destination file */ + rc = create_local_transfers(gfid, tta); + if ((rc != UNIFYFS_SUCCESS) && (rc != ENOENT)) { + LOGERR("failed to create local transfers - %s", + unifyfs_rc_enum_description(rc)); + } + + /* submit transfer request for processing */ + rc = sm_submit_transfer_request(tta); + if (rc != UNIFYFS_SUCCESS) { + ret = rc; } + } else if (is_owner && (transfer_mode == SERVER_TRANSFER_MODE_OWNER)) { + // TODO: support SERVER_TRANSFER_MODE_OWNER + ret = UNIFYFS_ERROR_NYI; + } + + if (ret != UNIFYFS_SUCCESS) { + LOGERR("transfer(gfid=%d, mode=%d, file=%s) failed - %s", + gfid, transfer_mode, dest_file, + unifyfs_rc_enum_description(ret)); } return ret; } @@ -615,7 +632,11 @@ int sm_truncate(int gfid, size_t filesize) LOGERR("truncate broadcast failed"); } } + } else if (!is_owner && (ENOENT == ret)) { + /* non-owner is 
not guaranteed to have local metadata for gfid */ + ret = UNIFYFS_SUCCESS; } + return ret; } @@ -696,6 +717,11 @@ static int spawn_local_transfers(void) /* get next transfer */ tta = (transfer_thread_args*) arraylist_remove(transfers, i); + /* record transfer start time on initiator */ + if (glb_pmi_rank == tta->client_server) { + gettimeofday(&(tta->transfer_time), NULL); + } + /* spawn transfer helper thread */ int rc = pthread_create(&(tta->thrd), NULL, transfer_helper_thread, (void*)tta); @@ -741,7 +767,7 @@ static int complete_local_transfers(void) /* get next transfer */ tta = (transfer_thread_args*) arraylist_remove(transfers, i); - /* spawn transfer helper thread */ + /* join transfer helper thread */ int rc = pthread_join(tta->thrd, NULL); if (rc != 0) { LOGERR("failed to join transfer helper thread for tta=%p", tta); @@ -749,9 +775,22 @@ static int complete_local_transfers(void) } if (glb_pmi_rank == tta->client_server) { + struct timeval s_time = tta->transfer_time; + struct timeval e_time; + gettimeofday(&e_time, NULL); + int tv_sec = e_time.tv_sec - s_time.tv_sec; + if (e_time.tv_usec < s_time.tv_usec) { + tv_sec -= 1; + e_time.tv_usec += 1000000; + } + int tv_usec = e_time.tv_usec - s_time.tv_usec; + rc = invoke_client_transfer_complete_rpc(tta->client_app, tta->client_id, tta->transfer_id, + tta->file_sz, + tv_sec, + tv_usec, tta->status); if (rc != 0) { LOGERR("failed transfer(id=%d) complete rpc to client[%d:%d]", @@ -1204,18 +1243,23 @@ static int process_transfer_bcast_rpc(server_rpc_req_t* req) int transfer_mode = (int) in->mode; const char* dest_file = (const char*) in->dst_file; - LOGDBG("gfid=%d file=%s", gfid, dest_file); + LOGDBG("gfid=%d dest_file=%s", gfid, dest_file); /* do file transfer */ int ret = sm_transfer(src_rank, -1, -1, -1, gfid, transfer_mode, - dest_file, req); + dest_file, req->coll); if (UNIFYFS_SUCCESS != ret) { /* submission of transfer request failed */ + LOGERR("sm_transfer() failed for gfid=%d - rc=%d", + gfid, ret); collective_set_local_retval(req->coll, ret); - /* create a ULT to finish broadcast operation */ - ret = invoke_bcast_progress_rpc(req->coll); + /* create a ULT to finish broadcast operation */ + invoke_bcast_progress_rpc(req->coll); } + /* when sm_transfer() returns SUCCESS, we assume a transfer thread will + * be spawned, and that it will progress the broadcast collective after + * completing its data transfers */ return ret; } diff --git a/server/src/unifyfs_service_manager.h b/server/src/unifyfs_service_manager.h index 880567048..18004d67b 100644 --- a/server/src/unifyfs_service_manager.h +++ b/server/src/unifyfs_service_manager.h @@ -96,7 +96,7 @@ int sm_transfer(int client_server, int gfid, int transfer_mode, const char* dest_file, - server_rpc_req_t* bcast_req); + void* bcast_coll); int sm_truncate(int gfid, size_t filesize); diff --git a/server/src/unifyfs_transfer.c b/server/src/unifyfs_transfer.c index 2c117970e..8dbd6d06a 100644 --- a/server/src/unifyfs_transfer.c +++ b/server/src/unifyfs_transfer.c @@ -14,7 +14,7 @@ #include "unifyfs_inode.h" -#include "unifyfs_group_rpc.h" + #include "unifyfs_service_manager.h" #include "unifyfs_transfer.h" #include @@ -83,6 +83,9 @@ static int read_local_extent(extent_metadata* ext, if (NULL != app_clnt) { logio_context* logio_ctx = app_clnt->state.logio_ctx; if (NULL != logio_ctx) { + LOGDBG("reading extent(file_offset=%zu, sz=%zu) from log[%d:%d]", + (size_t)chk->file_offset, chk->chunk_sz, app_id, cli_id); + size_t nread = 0; int rc = unifyfs_logio_read(logio_ctx, log_offset, 
chk->chunk_sz, buf, &nread); @@ -106,64 +109,68 @@ static int read_local_extent(extent_metadata* ext, /* find local extents for the given gfid and initialize transfer helper * thread state */ int create_local_transfers(int gfid, - const char* dest_file, transfer_thread_args* tta) { - if ((NULL == dest_file) || (NULL == tta)) { + int ret = UNIFYFS_SUCCESS; + + if (NULL == tta) { return EINVAL; } + size_t n_local_extents = 0; + size_t total_local_data_sz = 0; + size_t n_extents = 0; extent_metadata* extents = NULL; int rc = unifyfs_inode_get_extents(gfid, &n_extents, &extents); if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to get extents from inode for gfid=%d", gfid); - return rc; - } else if (n_extents == 0) { - return UNIFYFS_SUCCESS; - } - - /* determine local extents */ - extent_metadata* ext; - size_t n_local_extents = 0; - size_t total_local_data_sz = 0; - for (size_t i = 0; i < n_extents; i++) { - ext = extents + i; - if (glb_pmi_rank == ext->svr_rank) { - total_local_data_sz += extent_length(ext); - n_local_extents++; + if (rc != ENOENT) { + LOGERR("failed to get extents from inode for gfid=%d", gfid); + } + ret = rc; + } else { + /* determine local extents */ + extent_metadata* ext; + for (size_t i = 0; i < n_extents; i++) { + ext = extents + i; + if (glb_pmi_rank == ext->svr_rank) { + total_local_data_sz += extent_length(ext); + n_local_extents++; + } } - } - /* make an array of local extents */ - extent_metadata* local_extents = (extent_metadata*) - calloc(n_local_extents, sizeof(extent_metadata)); - if (NULL == local_extents) { - LOGERR("failed to allocate local extents for gfid=%d", gfid); - free(extents); - return ENOMEM; - } + if (n_local_extents) { + /* make an array of local extents */ + extent_metadata* local_extents = (extent_metadata*) + calloc(n_local_extents, sizeof(extent_metadata)); + if (NULL == local_extents) { + LOGERR("failed to allocate local extents for gfid=%d", gfid); + ret = ENOMEM; + } else { + extent_metadata* dst_ext; + size_t ext_ndx = 0; + for (size_t i = 0; i < n_extents; i++) { + ext = extents + i; + if (glb_pmi_rank == ext->svr_rank) { + dst_ext = local_extents + ext_ndx; + ext_ndx++; + memcpy(dst_ext, ext, sizeof(*ext)); + } + } + } - extent_metadata* dst_ext; - size_t ext_ndx = 0; - for (size_t i = 0; i < n_extents; i++) { - ext = extents + i; - if (glb_pmi_rank == ext->svr_rank) { - dst_ext = local_extents + ext_ndx; - ext_ndx++; - memcpy(dst_ext, ext, sizeof(*ext)); + tta->local_extents = local_extents; } - } - free(extents); + if (NULL != extents) { + free(extents); + } + } - tta->dst_file = strdup(dest_file); - tta->gfid = gfid; - tta->local_extents = local_extents; tta->n_extents = n_local_extents; tta->local_data_sz = total_local_data_sz; - return UNIFYFS_SUCCESS; + return ret; } void release_transfer_thread_args(transfer_thread_args* tta) @@ -175,6 +182,7 @@ void release_transfer_thread_args(transfer_thread_args* tta) if (NULL != tta->dst_file) { free((char*)(tta->dst_file)); } + memset(tta, 0, sizeof(*tta)); free(tta); } } @@ -184,114 +192,133 @@ void* transfer_helper_thread(void* arg) transfer_thread_args* tta = (transfer_thread_args*)arg; assert(NULL != arg); + int fd = -1; int rc; int ret = UNIFYFS_SUCCESS; + coll_request* coll = NULL; char* data_copy_buf = NULL; transfer_chunk* chunks = NULL; - extent_metadata* ext; transfer_chunk* chk; + extent_metadata* ext; - LOGDBG("I am transfer thread for gfid=%d file=%s", - tta->gfid, tta->dst_file); - - /* open destination file (create if it doesn't exist) */ - int flags = O_CREAT | O_WRONLY; - 
int mode = 0640; - int fd = open(tta->dst_file, flags, mode); - if (fd == -1) { - int err = errno; - LOGERR("failed to open(%s) - %s", tta->dst_file, strerror(err)); - tta->status = err; - return arg; + if (NULL != tta->bcast_coll) { + coll = (coll_request*) tta->bcast_coll; + tta->client_app = coll->app_id; + tta->client_id = coll->client_id; + tta->transfer_id = coll->client_req_id; } - /* get number of local extents and their total size */ - size_t total_local_data_sz = tta->local_data_sz; - size_t n_extents = tta->n_extents; + LOGDBG("I am transfer thread for gfid=%d file=%s collective=%p", + tta->gfid, tta->dst_file, coll); - /* allocate transfer_chunk array */ - chunks = calloc(n_extents, sizeof(transfer_chunk)); - if (NULL == chunks) { - LOGERR("failed to allocate transfer chunk state"); - ret = ENOMEM; - goto transfer_cleanup; - } + if (tta->local_data_sz) { + /* open destination file (create if it doesn't exist) */ + int flags = O_CREAT | O_WRONLY; + int mode = 0640; + fd = open(tta->dst_file, flags, mode); + if (fd == -1) { + int err = errno; + LOGERR("failed to open(%s) - %s", tta->dst_file, strerror(err)); + tta->status = err; + return arg; + } - /* allocate copy buffer for chunk data */ - size_t max_buffer = UNIFYFS_TRANSFER_MAX_BUFFER; - size_t buf_sz = max_buffer; - if (total_local_data_sz <= max_buffer) { - buf_sz = total_local_data_sz; - } else { - /* make sure longest extent will fit in copy buffer */ - for (size_t i = 0; i < n_extents; i++) { - ext = tta->local_extents + i; - size_t ext_sz = extent_length(ext); - if (ext_sz > buf_sz) { - buf_sz = ext_sz; + /* get number of local extents and their total size */ + size_t total_local_data_sz = tta->local_data_sz; + size_t n_extents = tta->n_extents; + + /* allocate transfer_chunk array */ + chunks = calloc(n_extents, sizeof(transfer_chunk)); + if (NULL == chunks) { + LOGERR("failed to allocate transfer chunk state"); + ret = ENOMEM; + goto transfer_cleanup; + } + + /* allocate copy buffer for chunk data */ + size_t max_buffer = UNIFYFS_TRANSFER_MAX_BUFFER; + size_t buf_sz = max_buffer; + if (total_local_data_sz <= max_buffer) { + buf_sz = total_local_data_sz; + } else { + /* make sure longest extent will fit in copy buffer */ + for (size_t i = 0; i < n_extents; i++) { + ext = tta->local_extents + i; + size_t ext_sz = extent_length(ext); + if (ext_sz > buf_sz) { + buf_sz = ext_sz; + } } } - } - data_copy_buf = malloc(buf_sz); - if (NULL == data_copy_buf) { - LOGERR("failed to allocate transfer copy buffer"); - ret = ENOMEM; - goto transfer_cleanup; - } + data_copy_buf = malloc(buf_sz); + if (NULL == data_copy_buf) { + LOGERR("failed to allocate transfer copy buffer"); + ret = ENOMEM; + goto transfer_cleanup; + } - /* read local data for all extents and write it to corresponding - * offsets within destination file. */ - size_t ext_ndx = 0; /* tracks extent array index */ - size_t chk_ndx = 0; /* tracks chunk array index */ - do { - size_t begin_chk_ndx = chk_ndx; - size_t copy_sz = 0; - for (size_t i = ext_ndx; i < n_extents; i++) { - ext = tta->local_extents + i; - size_t ext_sz = extent_length(ext); - if ((copy_sz + ext_sz) <= buf_sz) { - chk = chunks + chk_ndx; - chk_ndx++; - ext_ndx++; - - chk->chunk_data = data_copy_buf + copy_sz; - copy_sz += ext_sz; - - rc = read_local_extent(ext, chk); + /* read local data for all extents and write it to corresponding + * offsets within destination file. 
*/ + size_t ext_ndx = 0; /* tracks extent array index */ + size_t chk_ndx = 0; /* tracks chunk array index */ + do { + size_t begin_chk_ndx = chk_ndx; + size_t copy_sz = 0; + for (size_t i = ext_ndx; i < n_extents; i++) { + ext = tta->local_extents + i; + size_t ext_sz = extent_length(ext); + if ((copy_sz + ext_sz) <= buf_sz) { + chk = chunks + chk_ndx; + chk_ndx++; + ext_ndx++; + + chk->chunk_data = data_copy_buf + copy_sz; + copy_sz += ext_sz; + + rc = read_local_extent(ext, chk); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("failed to copy extent[%zu] data for gfid=%d", + i, tta->gfid); + ret = rc; + goto transfer_cleanup; + } + } else { + /* no room left in copy buffer */ + break; + } + } + + /* write out data chunks for extents processed in this iteration */ + for (size_t i = begin_chk_ndx; i < chk_ndx; i++) { + chk = chunks + i; + rc = write_transfer_chunk(fd, chk); if (rc != UNIFYFS_SUCCESS) { - LOGERR("failed to copy extent[%zu] data for gfid=%d", - i, tta->gfid); + LOGERR("write_transfer_chunk(dst=%s, chk=%zu) failed", + tta->dst_file, i); ret = rc; goto transfer_cleanup; } - } else { - /* no room left in copy buffer */ - break; - } - } - - /* write out data chunks for extents processed in this iteration */ - for (size_t i = begin_chk_ndx; i < chk_ndx; i++) { - chk = chunks + i; - rc = write_transfer_chunk(fd, chk); - if (rc != UNIFYFS_SUCCESS) { - LOGERR("write_transfer_chunk(dst=%s, chk=%zu) failed", - tta->dst_file, i); - ret = rc; - goto transfer_cleanup; } - } - } while (ext_ndx < n_extents); + } while (ext_ndx < n_extents); + } transfer_cleanup: - close(fd); + if (-1 != fd) { + close(fd); + } + tta->status = ret; - if (NULL != tta->bcast_req) { - /* create a ULT to finish broadcast operation */ - collective_set_local_retval(tta->bcast_req->coll, ret); - invoke_bcast_progress_rpc(tta->bcast_req->coll); + if (NULL != coll) { + /* finish broadcast collective operation */ + collective_set_local_retval(coll, ret); + rc = collective_finish(coll); + if (rc != UNIFYFS_SUCCESS) { + LOGERR("collective_finish() failed for collective(%p) (rc=%d)", + coll, ret); + } + collective_cleanup(coll); } LOGDBG("signaling transfer completion"); diff --git a/server/src/unifyfs_transfer.h b/server/src/unifyfs_transfer.h index 91cb65390..7f356dbff 100644 --- a/server/src/unifyfs_transfer.h +++ b/server/src/unifyfs_transfer.h @@ -16,12 +16,8 @@ #define UNIFYFS_TRANSFER_H #include "unifyfs_global.h" +#include "unifyfs_group_rpc.h" -/* server transfer modes */ -typedef enum { - TRANSFER_MODE_OWNER = 0, /* owner transfers all data */ - TRANSFER_MODE_LOCAL = 1 /* each server transfers local data */ -} transfer_mode_e; /* transfer helper thread arguments structure */ typedef struct transfer_thread_args { @@ -38,9 +34,12 @@ typedef struct transfer_thread_args { extent_metadata* local_extents; size_t n_extents; - size_t local_data_sz; /* total size of local data */ + size_t local_data_sz; /* total size of local data in bytes */ + size_t file_sz; /* source file size in bytes */ - server_rpc_req_t* bcast_req; /* bcast rpc req state */ + struct timeval transfer_time; /* elapsed transfer time */ + + coll_request* bcast_coll; /* bcast rpc collective req state */ int status; /* status for entire set of transfers */ pthread_t thrd; /* pthread id for transfer helper thread */ @@ -51,7 +50,6 @@ void release_transfer_thread_args(transfer_thread_args* tta); /* find local extents for the given gfid and initialize transfer helper * thread state */ int create_local_transfers(int gfid, - const char* dest_file, 
transfer_thread_args* tta); /** diff --git a/server/src/unifyfs_tree.c b/server/src/unifyfs_tree.c index 5fa3b3c4f..741e7823d 100644 --- a/server/src/unifyfs_tree.c +++ b/server/src/unifyfs_tree.c @@ -130,8 +130,11 @@ int unifyfs_tree_init( void unifyfs_tree_free(unifyfs_tree_t* t) { /* free child rank list */ - free(t->child_ranks); - t->child_ranks = NULL; + if (NULL != t->child_ranks) { + free(t->child_ranks); + t->child_ranks = NULL; + } + memset(t, 0, sizeof(unifyfs_tree_t)); return; } diff --git a/t/0700-unifyfs-stage-full.t b/t/0700-unifyfs-stage-full.t index c25ad0e72..228d59ac6 100755 --- a/t/0700-unifyfs-stage-full.t +++ b/t/0700-unifyfs-stage-full.t @@ -60,9 +60,9 @@ stage_in_log=$stage_cfg_dir/stage_IN.log stage_out_log=$stage_cfg_dir/stage_OUT.log stage_exe=${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage -$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} $stage_in_manifest &> $stage_in_log +$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} -S $stage_cfg_dir $stage_in_manifest &> $stage_in_log -$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} $stage_out_manifest &> $stage_out_log +$JOB_RUN_COMMAND $stage_exe -v -m ${UNIFYFS_TEST_MOUNT} -S $stage_cfg_dir $stage_out_manifest &> $stage_out_log test_expect_success "input file has been staged to output" ' test_path_is_file $stage_dst_file diff --git a/t/9300-unifyfs-stage-isolated.t b/t/9300-unifyfs-stage-isolated.t deleted file mode 100755 index 92544f741..000000000 --- a/t/9300-unifyfs-stage-isolated.t +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# -# Test unifyfs-stage executable for basic functionality -# - -test_description="Test basic functionality of unifyfs-stage executable" - -. $(dirname $0)/sharness.sh - -test_expect_success "unifyfs-stage exists" ' - test_path_is_file ${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage -' -test_expect_success "testing temp dir exists" ' - test_path_is_dir ${UNIFYFS_TEST_TMPDIR} -' - -stage_cfg_dir=${UNIFYFS_TEST_TMPDIR}/stage/config_9300 -stage_src_dir=${UNIFYFS_TEST_TMPDIR}/stage/source -stage_dst_dir=${UNIFYFS_TEST_TMPDIR}/stage/destination_9300 -mkdir -p $stage_cfg_dir $stage_src_dir $stage_dst_dir - -test_expect_success "stage testing dirs exist" ' - test_path_is_dir $stage_cfg_dir && - test_path_is_dir $stage_src_dir && - test_path_is_dir $stage_dst_dir -' - -stage_src_file=$stage_src_dir/source_9300.file -stage_dst_file=$stage_dst_dir/destination_9300.file - -rm -f $stage_cfg_dir/* $stage_dst_dir/* - -test_expect_success "config_9300 directory is empty" ' - test_dir_is_empty $stage_cfg_dir -' - -# NOTE: we're using the unifyfs-stage binary as its own transfer data target -# because we know it's there and it's filled with non-zero data. 
-stage_exe=${UNIFYFS_BUILD_DIR}/util/unifyfs-stage/src/unifyfs-stage -cp $stage_exe $stage_src_file - -test_expect_success "source.file exists" ' - test_path_is_file $stage_src_file -' - -stage_manifest=$stage_cfg_dir/stage.manifest -echo "\"$stage_src_file\" \"$stage_dst_file\"" > $stage_manifest - -test_expect_success "config_9300 directory now has manifest file" ' - test_path_is_file $stage_manifest -' - -test_expect_success "target directory is empty" ' - test_dir_is_empty $stage_dst_dir -' - -stage_log=$stage_cfg_dir/stage.log -$JOB_RUN_COMMAND $stage_exe -N $stage_manifest &> $stage_log - -test_expect_success "input file has been staged to output" ' - test_path_is_file $stage_dst_file -' - -export TEST_CMP='cmp --quiet' - -test_expect_success "final output is identical to initial input" ' - test_cmp $stage_src_file $stage_dst_file -' - -test_done diff --git a/t/Makefile.am b/t/Makefile.am index 049fbc556..2c8b31816 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -24,7 +24,6 @@ TESTS += \ 9020-mountpoint-empty.t \ 9200-seg-tree-test.t \ 9201-slotmap-test.t \ - 9300-unifyfs-stage-isolated.t \ 9999-cleanup.t check_SCRIPTS = $(TESTS) diff --git a/t/api/laminate.c b/t/api/laminate.c index 931065042..e908edca7 100644 --- a/t/api/laminate.c +++ b/t/api/laminate.c @@ -85,7 +85,7 @@ int api_laminate_test(char* unifyfs_root, __FILE__, __LINE__, testfile, rc, unifyfs_rc_enum_description(rc)); /* (2) stat testfile, should report not laminated */ - unifyfs_status status; + unifyfs_file_status status; memset(&status, 0, sizeof(status)); rc = unifyfs_stat(*fshdl, gfid, &status); diff --git a/t/api/storage-reuse.c b/t/api/storage-reuse.c index c71357bf8..1a16e4f69 100644 --- a/t/api/storage-reuse.c +++ b/t/api/storage-reuse.c @@ -104,7 +104,7 @@ int api_storage_test(char* unifyfs_root, /* (3) stat testfile1 to verify size */ - unifyfs_status t1_status = {0}; + unifyfs_file_status t1_status = {0}; rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); /* expected size=filesize since writes have been synced */ ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), @@ -210,7 +210,7 @@ int api_storage_test(char* unifyfs_root, /* (8) stat testfile2 to verify size */ - unifyfs_status t2_status = {0}; + unifyfs_file_status t2_status = {0}; rc = unifyfs_stat(*fshdl, t2_gfid, &t2_status); /* expected size=filesize since writes have been synced */ ok((rc == UNIFYFS_SUCCESS) && (t2_status.global_file_size == filesize), diff --git a/t/api/transfer.c b/t/api/transfer.c index 70fbf3872..98529d905 100644 --- a/t/api/transfer.c +++ b/t/api/transfer.c @@ -107,7 +107,7 @@ int api_transfer_test(char* unifyfs_root, /* (3) stat source file to verify size */ - unifyfs_status t1_status = {0}; + unifyfs_file_status t1_status = {0}; rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); /* expected size=filesize since writes have been synced */ ok((rc == UNIFYFS_SUCCESS) && (t1_status.global_file_size == filesize), diff --git a/t/api/write-read-sync-stat.c b/t/api/write-read-sync-stat.c index 3e72fa993..38717aefc 100644 --- a/t/api/write-read-sync-stat.c +++ b/t/api/write-read-sync-stat.c @@ -159,7 +159,7 @@ int api_write_read_sync_stat_test(char* unifyfs_root, __FILE__, __LINE__, testfile3, rc, unifyfs_rc_enum_description(rc)); /* (4) stat all files */ - unifyfs_status t1_status, t2_status, t3_status; + unifyfs_file_status t1_status, t2_status, t3_status; rc = unifyfs_stat(*fshdl, t1_gfid, &t1_status); /* expected size=filesize since writes have been synced */ diff --git a/t/ci/800-stage-tests.sh 
b/t/ci/800-stage-tests.sh index 9e6b72c7a..6145bdd48 100755 --- a/t/ci/800-stage-tests.sh +++ b/t/ci/800-stage-tests.sh @@ -109,15 +109,18 @@ test_expect_success "config directory now has manifest files" ' STAGE_IN_ERR=${STAGE_LOG_DIR}/stage_IN_800_${STAGE_FILE_CFG}.err STAGE_OUT_ERR=${STAGE_LOG_DIR}/stage_OUT_800_${STAGE_FILE_CFG}.err +STAGE_IN_STATUS=${STAGE_LOG_DIR}/stage_IN_800_${STAGE_FILE_CFG}.status +STAGE_OUT_STATUS=${STAGE_LOG_DIR}/stage_OUT_800_${STAGE_FILE_CFG}.status + # run and time the stage-in operation. -STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c ${MAN_IN}" +STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -S ${STAGE_IN_STATUS} -m ${UNIFYFS_MP} -v -c ${MAN_IN}" echo "stagein_command: ${STAGEIN_COMMAND}" TIME_IN_START=`date +%s` my_stagein_output="$($STAGEIN_COMMAND)" TIME_IN_END=`date +%s` # run and time the stage-out operation. -STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c ${MAN_OUT}" +STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -S ${STAGE_OUT_STATUS} -m ${UNIFYFS_MP} -v -c ${MAN_OUT}" echo "stageOUT_command: ${STAGEOUT_COMMAND}" TIME_OUT_START=`date +%s` my_stageout_output="$($STAGEOUT_COMMAND)" @@ -166,84 +169,84 @@ fi ##### Parallel unifyfs-stage tests ##### -# TODO: Remove if block once parallel transfer logic is working -if test_have_prereq PARALLEL_TRANSFER_FIXED; then - # set up what the intermediate filename will be within UnifyFS and also - # the final file name after copying it back out. Then use those - # filenames to create the two manifest files, one for copying the - # file in, and one for copying the file out. - STAGE_IM_FILE=${UNIFYFS_MP}/intermediate_parallel_${STAGE_FILE_CFG}.file - STAGE_P_DST_FILE=${STAGE_DST_DIR}/destination_parallel_800_${STAGE_FILE_CFG}.file - MAN_IN=${STAGE_CFG_DIR}/stage_IN_parallel_${STAGE_FILE_CFG}.manifest - MAN_OUT=${STAGE_CFG_DIR}/stage_OUT_parallel_${STAGE_FILE_CFG}.manifest - echo "\"${STAGE_SRC_FILE}\" \"${STAGE_IM_FILE}\"" > ${MAN_IN} - echo "\"${STAGE_IM_FILE}\" \"${STAGE_P_DST_FILE}\"" > ${MAN_OUT} - - test_expect_success "config directory now has parallel manifest files" ' - test_path_is_file $MAN_IN && - test_path_is_file $MAN_OUT - ' - - STAGE_IN_ERR=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.err - STAGE_OUT_ERR=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.err - - # run and time the stage-in operation. - STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c -p ${MAN_IN}" - echo "stagein_command: ${STAGEIN_COMMAND}" - TIME_IN_START=`date +%s` - my_stagein_output="$($STAGEIN_COMMAND)" - TIME_IN_END=`date +%s` - - # run and time the stage-out operation. 
- STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -m ${UNIFYFS_MP} -v -c -p ${MAN_OUT}" - echo "stageOUT_command: ${STAGEOUT_COMMAND}" - TIME_OUT_START=`date +%s` - my_stageout_output="$($STAGEOUT_COMMAND)" - TIME_OUT_END=`date +%s` - - STAGE_IN_LOG=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.out - STAGE_OUT_LOG=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.out - echo $my_stagein_output > $STAGE_IN_LOG - echo $my_stageout_output > $STAGE_OUT_LOG - - ELAPSED_TIME_IN=$(( ${TIME_IN_END} - ${TIME_IN_START} )) - ELAPSED_TIME_OUT=$(( ${TIME_OUT_END} - ${TIME_OUT_START} )) - echo "time to stage in parallel: $ELAPSED_TIME_IN s" - echo "time to stage out parallel: $ELAPSED_TIME_OUT s" - - test_expect_success "parallel: input file has been staged to output" ' - test_path_is_file $STAGE_P_DST_FILE - ' - - # This block is used to indirectly get the test result back to us - # of whether the file comparison failed, so that we can put - # that result in the line of the timing file. - SUCCESS_TOTAL_BEFORE=${test_success} - test_expect_success "parallel: final output is identical to initial input" ' - test_cmp $STAGE_P_DST_FILE $STAGE_SRC_FILE - ' - SUCCESS_TOTAL_AFTER=${test_success} - - # If the success total is *different*, then the final test - # (the file comparison after to before) passed. - # If they're the same, then it failed. - if [ ${SUCCESS_TOTAL_BEFORE} == ${SUCCESS_TOTAL_AFTER} ]; then - echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ - ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ - ^${STAGE_TEST_OVERALL_CONFIG}^ \ - @${STAGE_TEST_SPECIFIC_CONFIG}@ %FAIL%" \ - >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat - else - echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ - ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ - ^${STAGE_TEST_OVERALL_CONFIG}^ \ - @${STAGE_TEST_SPECIFIC_CONFIG}@ %GOOD%" \ - >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat - fi - - test_expect_success "serial output is identical to parallel output" ' - test_cmp $STAGE_DST_FILE $STAGE_P_DST_FILE - ' +# set up what the intermediate filename will be within UnifyFS and also +# the final file name after copying it back out. Then use those +# filenames to create the two manifest files, one for copying the +# file in, and one for copying the file out. +STAGE_P_IM_FILE=${UNIFYFS_MP}/intermediate_parallel_${STAGE_FILE_CFG}.file +STAGE_P_DST_FILE=${STAGE_DST_DIR}/destination_parallel_800_${STAGE_FILE_CFG}.file +MAN_IN=${STAGE_CFG_DIR}/stage_IN_parallel_${STAGE_FILE_CFG}.manifest +MAN_OUT=${STAGE_CFG_DIR}/stage_OUT_parallel_${STAGE_FILE_CFG}.manifest +echo "\"${STAGE_SRC_FILE}\" \"${STAGE_P_IM_FILE}\"" > ${MAN_IN} +echo "\"${STAGE_P_IM_FILE}\" \"${STAGE_P_DST_FILE}\"" > ${MAN_OUT} + +test_expect_success "config directory now has parallel manifest files" ' + test_path_is_file $MAN_IN && + test_path_is_file $MAN_OUT +' + +STAGE_IN_ERR=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.err +STAGE_OUT_ERR=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.err + +STAGE_IN_STATUS=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.status +STAGE_OUT_STATUS=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.status + +# run and time the stage-in operation. +STAGEIN_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_IN_ERR $STAGE_EXE -S ${STAGE_IN_STATUS} -m ${UNIFYFS_MP} -v -c -p ${MAN_IN}" +echo "stagein_command: ${STAGEIN_COMMAND}" +TIME_IN_START=`date +%s` +my_stagein_output="$($STAGEIN_COMMAND)" +TIME_IN_END=`date +%s` + +# run and time the stage-out operation. 
+STAGEOUT_COMMAND="${JOB_RUN_COMMAND} $app_err $STAGE_OUT_ERR $STAGE_EXE -S ${STAGE_OUT_STATUS} -m ${UNIFYFS_MP} -v -c -p ${MAN_OUT}" +echo "stageOUT_command: ${STAGEOUT_COMMAND}" +TIME_OUT_START=`date +%s` +my_stageout_output="$($STAGEOUT_COMMAND)" +TIME_OUT_END=`date +%s` + +STAGE_IN_LOG=${STAGE_LOG_DIR}/stage_IN_parallel_800_${STAGE_FILE_CFG}.out +STAGE_OUT_LOG=${STAGE_LOG_DIR}/stage_OUT_parallel_800_${STAGE_FILE_CFG}.out +echo $my_stagein_output > $STAGE_IN_LOG +echo $my_stageout_output > $STAGE_OUT_LOG + +ELAPSED_TIME_IN=$(( ${TIME_IN_END} - ${TIME_IN_START} )) +ELAPSED_TIME_OUT=$(( ${TIME_OUT_END} - ${TIME_OUT_START} )) +echo "time to stage in parallel: $ELAPSED_TIME_IN s" +echo "time to stage out parallel: $ELAPSED_TIME_OUT s" + +test_expect_success "parallel: input file has been staged to output" ' + test_path_is_file $STAGE_P_DST_FILE +' + +# This block is used to indirectly get the test result back to us +# of whether the file comparison failed, so that we can put +# that result in the line of the timing file. +SUCCESS_TOTAL_BEFORE=${test_success} +test_expect_success "parallel: final output is identical to initial input" ' + test_cmp $STAGE_P_DST_FILE $STAGE_SRC_FILE +' +SUCCESS_TOTAL_AFTER=${test_success} + +# If the success total is *different*, then the final test +# (the file comparison after to before) passed. +# If they're the same, then it failed. +if [ ${SUCCESS_TOTAL_BEFORE} == ${SUCCESS_TOTAL_AFTER} ]; then + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %FAIL%" \ + >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat +else + echo "${STAGE_TEST_INDEX} ${STAGE_FILE_SIZE_IN_MB} \ + ${ELAPSED_TIME_IN} ${ELAPSED_TIME_OUT} \ + ^${STAGE_TEST_OVERALL_CONFIG}^ \ + @${STAGE_TEST_SPECIFIC_CONFIG}@ %GOOD%" \ + >> ${STAGE_LOG_DIR}/timings_parallel_${JOB_ID}.dat fi +test_expect_success "serial output is identical to parallel output" ' + test_cmp $STAGE_DST_FILE $STAGE_P_DST_FILE +' + rm -f $STAGE_SRC_FILE diff --git a/util/unifyfs-stage/src/Makefile.am b/util/unifyfs-stage/src/Makefile.am index b8127ddbe..4221c55f5 100644 --- a/util/unifyfs-stage/src/Makefile.am +++ b/util/unifyfs-stage/src/Makefile.am @@ -13,12 +13,7 @@ stage_cppflags = $(AM_CPPFLAGS) $(MPI_CFLAGS) \ -I$(top_srcdir)/client/src \ -I$(top_srcdir)/common/src -if USE_PMPI_WRAPPERS -stage_cppflags += -DENABLE_MPI_MOUNT -stage_unify_lib = $(top_builddir)/client/src/libunifyfs_mpi.la -else -stage_unify_lib = $(top_builddir)/client/src/libunifyfs.la -endif +stage_unify_lib = $(top_builddir)/client/src/libunifyfs_api.la stage_ldadd = \ $(stage_unify_lib) \ @@ -28,7 +23,6 @@ stage_ldadd = \ stage_ldflags = \ $(AM_LDFLAGS) \ - $(CP_WRAPPERS) \ -static # Per-target flags begin here diff --git a/util/unifyfs-stage/src/unifyfs-stage-transfer.c b/util/unifyfs-stage/src/unifyfs-stage-transfer.c index 2273affad..7c3914a16 100644 --- a/util/unifyfs-stage/src/unifyfs-stage-transfer.c +++ b/util/unifyfs-stage/src/unifyfs-stage-transfer.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Copyright (c) 2022, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2020, UT-Battelle, LLC. + * Copyright 2022, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,29 +11,138 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. 
*/ + #include +#include +#include #include #include -#include #include -#include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include + #include #include #include "unifyfs-stage.h" +static +int read_unify_file_block(unifyfs_handle fshdl, + unifyfs_gfid gfid, + off_t file_offset, + size_t bufsize, + char* databuf, + size_t* nread) +{ + int ret = 0; + size_t len = 0; + + unifyfs_io_request rd_req; + rd_req.op = UNIFYFS_IOREQ_OP_READ; + rd_req.gfid = gfid; + rd_req.nbytes = bufsize; + rd_req.offset = file_offset; + rd_req.user_buf = (void*) databuf; + + unifyfs_rc urc = unifyfs_dispatch_io(fshdl, 1, &rd_req); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "unifyfs_dispatch_io(OP_READ) failed - %s", + unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + urc = unifyfs_wait_io(fshdl, 1, &rd_req, 1); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "unifyfs_wait_io(OP_READ) failed - %s", + unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + if (0 == rd_req.result.error) { + len = rd_req.result.count; + } else { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "OP_READ req failed - %s", + strerror(rd_req.result.error)); + ret = rd_req.result.error; + } + } + } + *nread = len; + return ret; +} + +static +int write_unify_file_block(unifyfs_handle fshdl, + unifyfs_gfid gfid, + off_t file_offset, + size_t bufsize, + char* databuf, + size_t* nwrite) +{ + int ret = 0; + size_t len = 0; + + unifyfs_io_request wr_req; + wr_req.op = UNIFYFS_IOREQ_OP_WRITE; + wr_req.gfid = gfid; + wr_req.nbytes = bufsize; + wr_req.offset = file_offset; + wr_req.user_buf = (void*) databuf; + + unifyfs_rc urc = unifyfs_dispatch_io(fshdl, 1, &wr_req); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "unifyfs_dispatch_io(OP_WRITE) failed - %s", + unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + urc = unifyfs_wait_io(fshdl, 1, &wr_req, 1); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "unifyfs_wait_io(OP_WRITE) failed - %s", + unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + if (0 == wr_req.result.error) { + len = wr_req.result.count; + } else { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "OP_WRITE req failed - %s", + strerror(wr_req.result.error)); + ret = wr_req.result.error; + } + } + } + *nwrite = len; + return ret; +} + +static +int read_file_block(int fd, + off_t file_offset, + size_t bufsize, + char* databuf, + size_t* nread) +{ + int err; + int ret = 0; + errno = 0; + ssize_t len = pread(fd, (void*) databuf, bufsize, file_offset); + if (-1 == len) { + err = errno; + fprintf(stderr, "UNIFYFS-STAGE ERROR: pread() failed - %s", + strerror(err)); + ret = err; + len = 0; + } + *nread = len; + return ret; +} + /** * @brief Run md5 checksum on specified file, send back * digest. 
@@ -43,44 +152,83 @@ * * @return 0 on success, errno otherwise */ -static int md5_checksum(const char* path, unsigned char* digest) +static int md5_checksum(unifyfs_stage* ctx, + bool is_unify_file, + const char* path, + unsigned char* digest) { + int err, fd, md5_rc, rc; int ret = 0; + unifyfs_gfid gfid; + unifyfs_rc urc; size_t len = 0; - int fd = -1; - unsigned char data[UNIFYFS_STAGE_MD5_BLOCKSIZE] = { 0, }; + off_t file_offset; MD5_CTX md5; - - fd = open(path, O_RDONLY); - if (fd < 0) { - perror("open"); - return errno; + unsigned char data[UNIFYFS_STAGE_MD5_BLOCKSIZE]; + + if (is_unify_file) { + fd = -1; + urc = unifyfs_open(ctx->fshdl, O_RDONLY, path, &gfid); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to unifyfs_open(%s) - %s\n", + path, unifyfs_rc_enum_description(urc)); + return unifyfs_rc_errno(urc); + } + } else { + errno = 0; + fd = open(path, O_RDONLY); + if (fd < 0) { + err = errno; + fprintf(stderr, "UNIFYFS-STAGE ERROR: failed to open(%s) - %s\n", + path, strerror(err)); + return err; + } } - ret = MD5_Init(&md5); - if (!ret) { - fprintf(stderr, "failed to create md5 context\n"); - goto out; - } + /* NOTE: MD5_xxxx() returns 1 for success */ + md5_rc = MD5_Init(&md5); + if (md5_rc != 1) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: failed to create MD5 context\n"); + ret = EIO; + } else { + file_offset = 0; + do { + len = 0; + memset(data, 0, sizeof(data)); + if (is_unify_file) { + rc = read_unify_file_block(ctx->fshdl, gfid, file_offset, + sizeof(data), (char*)data, &len); + } else { + rc = read_file_block(fd, file_offset, + sizeof(data), (char*)data, &len); + } + if (rc != 0) { + ret = EIO; + break; + } else if (len) { + file_offset += (off_t) len; + md5_rc = MD5_Update(&md5, data, len); + if (md5_rc != 1) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "MD5 checksum update failed\n"); + ret = EIO; + break; + } + } + } while (len != 0); - while ((len = read(fd, (void*) data, UNIFYFS_STAGE_MD5_BLOCKSIZE)) != 0) { - ret = MD5_Update(&md5, data, len); - if (!ret) { - fprintf(stderr, "failed to update checksum\n"); - goto out; + md5_rc = MD5_Final(digest, &md5); + if (md5_rc != 1) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: failed to finalize MD5\n"); + ret = EIO; } } - ret = MD5_Final(digest, &md5); - if (!ret) { - fprintf(stderr, "failed to finalize md5\n"); + if (-1 != fd) { + close(fd); } -out: - /* MD5_xx returns 1 for success */ - ret = (ret == 1 ? 0 : EIO); - close(fd); - return ret; } @@ -109,12 +257,18 @@ static char* checksum_str(char* buf, unsigned char* digest) /** * @brief takes check sums of two files and compares * - * @param src path to one file - * @param dst path to the other file + * @param src_in_unify is source file in UnifyFS? + * @param dst_in_unify is destination file in UnifyFS? 
+ * @param src source file path + * @param dst destination filepath * * @return 0 if files are identical, non-zero if not, or other error */ -static int verify_checksum(const char* src, const char* dst) +static int verify_checksum(unifyfs_stage* ctx, + bool src_in_unify, + bool dst_in_unify, + const char* src, + const char* dst) { int ret = 0; int i = 0; @@ -126,254 +280,316 @@ static int verify_checksum(const char* src, const char* dst) src_digest[MD5_DIGEST_LENGTH] = '\0'; dst_digest[MD5_DIGEST_LENGTH] = '\0'; - ret = md5_checksum(src, src_digest); + ret = md5_checksum(ctx, src_in_unify, src, src_digest); if (ret) { - fprintf(stderr, "failed to calculate checksum for %s (%s)\n", - src, strerror(ret)); + fprintf(stderr, "UNIFYFS-STAGE ERROR: MD5 checksum failure " + "for source file %s\n", + src); return ret; } - ret = md5_checksum(dst, dst_digest); + ret = md5_checksum(ctx, dst_in_unify, dst, dst_digest); if (ret) { - fprintf(stderr, "failed to calculate checksum for %s (%s)\n", - dst, strerror(ret)); + fprintf(stderr, "UNIFYFS-STAGE ERROR: MD5 checksum failure " + "for destination file %s\n", + dst); return ret; } - if (verbose) { - printf("[%d] src: %s, dst: %s\n", rank, - checksum_str(md5src, src_digest), - checksum_str(md5dst, dst_digest)); - } - for (i = 0; i < MD5_DIGEST_LENGTH; i++) { if (src_digest[i] != dst_digest[i]) { - fprintf(stderr, "[%d] checksum verification failed: " - "(src=%s, dst=%s)\n", rank, + fprintf(stderr, "UNIFYFS-STAGE ERROR: checksums do not match! " + "(src=%s, dst=%s)\n", checksum_str(md5src, src_digest), checksum_str(md5dst, dst_digest)); ret = EIO; + break; } } + if (verbose && (0 == ret)) { + printf("UNIFYFS-STAGE INFO: checksums (src=%s, dst=%s)\n", + checksum_str(md5src, src_digest), + checksum_str(md5dst, dst_digest)); + } return ret; } -/* - * Parse a line from the manifest in the form of: - * - * - * - * If the paths have spaces, they must be quoted. - * - * On success, return 0 along with allocated src and dest strings. These - * must be freed when you're finished with them. On failure return non-zero, - * and set src and dest to NULL. - * - * Note, leading and tailing whitespace are ok. They just get ignored. - * Lines with only whitespace are ignored. A line of all whitespace will - * return 0, with src and dest being NULL, so users should not check for - * 'if (*src == NULL)' to see if the function failed. They should be looking - * at the return code. 
- */ -/** - * @brief parses manifest file line, passes back src and dst strings - * - * @param line input manifest file line - * @param src return val of src filename - * @param dst return val of dst filename - * - * @return 0 if all was well, or there was nothing; non-zero on error - */ -int -unifyfs_parse_manifest_line(char* line, char** src, char** dest) +static +int distribute_source_file_data(unifyfs_stage* ctx, + const char* src_file_path, + const char* dst_file_path, + size_t transfer_blksz, + size_t num_file_blocks) { - char* new_src = NULL; - char* new_dest = NULL; - char* copy; - char* tmp; - unsigned long copy_len; - int i; - unsigned int tmp_count; - int in_quotes = 0; - int rc = 0; - - copy = strdup(line); - copy_len = strlen(copy) + 1;/* +1 for '\0' */ - - /* Replace quotes and separator with '\0' */ - for (i = 0; i < copy_len; i++) { - if (copy[i] == '"') { - in_quotes ^= 1;/* toggle */ - copy[i] = '\0'; - } else if (isspace(copy[i]) && !in_quotes) { - /* - * Allow any whitespace for our separator - */ - copy[i] = '\0'; + int rc; + int fd = -1; + int ret = 0; + unifyfs_gfid gfid; + unifyfs_rc urc; + + size_t blocks_per_client = num_file_blocks / ctx->total_ranks; + if (blocks_per_client < 8) { + /* somewhat arbitrary choice of minimum 8 blocks per client. + * also avoids distribution of small files */ + blocks_per_client = 8; + } + + /* rank 0 creates destination file */ + if (ctx->rank == 0) { + urc = unifyfs_create(ctx->fshdl, O_WRONLY, dst_file_path, &gfid); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to unifyfs_create(%s) - %s\n", + dst_file_path, unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); } } + MPI_Barrier(MPI_COMM_WORLD); + + size_t start_block_ndx = ctx->rank * blocks_per_client; + if (start_block_ndx < num_file_blocks) { + /* open source file */ + errno = 0; + fd = open(src_file_path, O_RDONLY); + if (fd < 0) { + ret = errno; + fprintf(stderr, "UNIFYFS-STAGE ERROR: failed to open(%s) - %s\n", + src_file_path, strerror(ret)); + } - /* - * copy[] now contains a series of strings, one after the other - * (possibly containing some NULL strings, which we ignore) - */ - tmp = copy; - while (tmp < copy + copy_len) { - tmp_count = strlen(tmp); - if (tmp_count > 0) { - /* We have a real string */ - if (!new_src) { - new_src = strdup(tmp); - } else { - if (!new_dest) { - new_dest = strdup(tmp); - } else { - /* Error: a third file name */ - rc = 1; + /* non-zero ranks just open destination file */ + if (ctx->rank != 0) { + urc = unifyfs_open(ctx->fshdl, O_WRONLY, dst_file_path, &gfid); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to unifyfs_open(%s) - %s\n", + dst_file_path, unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } + } + + /* make sure all the open/create calls succeeded */ + if (ret) { + goto err_ret; + } + + char* block_data = malloc(transfer_blksz); + if (NULL == block_data) { + ret = ENOMEM; + goto err_ret; + } + + for (size_t i = 0; i < blocks_per_client; i++) { + memset(block_data, 0, transfer_blksz); + size_t block_ndx = start_block_ndx + i; + if (block_ndx < num_file_blocks) { + size_t nread = 0; + off_t block_offset = block_ndx * transfer_blksz; + rc = read_file_block(fd, block_offset, transfer_blksz, + block_data, &nread); + if (rc) { + ret = rc; break; + } else { + size_t nwrite = 0; + rc = write_unify_file_block(ctx->fshdl, gfid, block_offset, + nread, block_data, &nwrite); + if (rc) { + ret = rc; + break; + } else if (verbose && (nread 
!= nwrite)) { + printf("[rank=%d] UNIFYFS-STAGE DEBUG: " + "mismatch on read=%zu / write=%zu bytes\n", + ctx->rank, nread, nwrite); + } } } } - tmp += tmp_count + 1; - } - /* Some kind of error parsing a line */ - if (rc != 0 || (new_src && !new_dest)) { - fprintf(stderr, "manifest file line >>%s<< is invalid!\n", - line); - free(new_src); - free(new_dest); - new_src = NULL; - new_dest = NULL; - if (rc == 0) { - rc = 1; + /* synchronize local writes */ + urc = unifyfs_sync(ctx->fshdl, gfid); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to unifyfs_sync(%s) - %s\n", + dst_file_path, unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); } } - *src = new_src; - *dest = new_dest; +err_ret: + if (fd != -1) { + close(fd); + } - free(copy); - return rc; + return ret; } /** - * @brief controls the action of the stage-in or stage-out. Opens up - * the manifest file, sends each line to be parsed, and fires - * each source/destination to be staged. + * @brief transfer source file to destination according to stage context * - * @param ctx stage context and instructions + * @param ctx stage context + * @param file_index index of file in manifest (>=1) + * @param src_file_size size in bytes of source file + * @param src_file_path source file path + * @param dst_file_path destination file path * - * @return 0 indicates success, non-zero is error + * @return 0 on success, errno otherwise */ -int unifyfs_stage_transfer(unifyfs_stage_t* ctx) +int unifyfs_stage_transfer(unifyfs_stage* ctx, + int file_index, + const char* src_file_path, + const char* dst_file_path) { + int err, rc; int ret = 0; - int count = 0; - FILE* fp = NULL; - char* src = NULL; - char* dst = NULL; - char linebuf[LINE_MAX] = { 0, }; + unifyfs_rc urc; - if (!ctx) { - return EINVAL; - } + const char* src = src_file_path; + const char* dst = dst_file_path; - fp = fopen(ctx->manifest_file, "r"); - if (!fp) { - fprintf(stderr, "failed to open file %s: %s\n", - ctx->manifest_file, strerror(errno)); - ret = errno; - goto out; + if (!ctx || (NULL == src) || (NULL == dst)) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "invalid stage_transfer() params\n"); + return EINVAL; } - while (NULL != fgets(linebuf, LINE_MAX - 1, fp)) { - if (strlen(linebuf) < 5) { - if (linebuf[0] == '\n') { - // manifest file ends in a blank line - goto out; - } else { - fprintf(stderr, "Short (bad) manifest file line: >%s<\n", - linebuf); - ret = -EINVAL; - goto out; - } - } - ret = unifyfs_parse_manifest_line(linebuf, &src, &dst); - if (ret < 0) { - fprintf(stderr, "failed to parse %s (%s)\n", - linebuf, strerror(ret)); - goto out; + /* decide which rank gets to manage the transfer */ + int mgr_rank = (file_index - 1) % ctx->total_ranks; + + bool src_in_unify = is_unifyfs_path(ctx->fshdl, src); + bool dst_in_unify = is_unifyfs_path(ctx->fshdl, dst); + if (src_in_unify && dst_in_unify) { + if (mgr_rank == ctx->rank) { + fprintf(stderr, + "UNIFYFS-STAGE ERROR: staging is not supported " + "for source (%s) and destination (%s) both in UnifyFS!\n", + src, dst); } - if (ctx->mode == UNIFYFS_STAGE_SERIAL) { - if (count % total_ranks == rank) { - if (verbose) { - fprintf(stdout, "[%d] serial transfer: src=%s, dst=%s\n", - rank, src, dst); - } + return EINVAL; + } - ret = unifyfs_transfer_file_serial(src, dst); - if (ret) { - fprintf(stderr, "[%d] failed to transfer file (err=%d)\n", - rank, -ret); - goto out; - } + if (ctx->mode == UNIFYFS_STAGE_MODE_SERIAL) { + /* use barrier to force sequential processing */ + 
MPI_Barrier(MPI_COMM_WORLD); + } - if (ret < 0) { - fprintf(stderr, "stat on %s failed (err=%d, %s)\n", - dst, errno, strerror(errno)); - ret = errno; - goto out; - } + if (src_in_unify && (mgr_rank == ctx->rank)) { + /* transfer manager rank initiates transfer using library API, + * other ranks do nothing */ + if (verbose) { + printf("[rank=%d] UNIFYFS-STAGE INFO: " + "transfer src=%s, dst=%s\n", + ctx->rank, src, dst); + } - if (ctx->checksum) { - ret = verify_checksum(src, dst); - if (ret) { - fprintf(stderr, "checksums for >%s< and >%s< differ!\n", - src, dst); - goto out; + unifyfs_transfer_request transfer = {0}; + transfer.src_path = src; + transfer.dst_path = dst; + transfer.mode = UNIFYFS_TRANSFER_MODE_COPY; + transfer.use_parallel = 1; + urc = unifyfs_dispatch_transfer(ctx->fshdl, 1, &transfer); + if (urc != UNIFYFS_SUCCESS) { + fprintf(stderr, "[rank=%d] UNIFYFS-STAGE ERROR: " + "transfer dispatch failed! %s\n", + ctx->rank, unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + urc = unifyfs_wait_transfer(ctx->fshdl, 1, &transfer, 1); + if (urc != UNIFYFS_SUCCESS) { + fprintf(stderr, "[rank=%d] UNIFYFS-STAGE ERROR: " + "transfer wait failed! %s\n", + ctx->rank, unifyfs_rc_enum_description(urc)); + ret = unifyfs_rc_errno(urc); + } else { + if (transfer.result.rc == UNIFYFS_SUCCESS) { + if (verbose) { + printf("[rank=%d] UNIFYFS-STAGE INFO: " + "file size = %zu B, " + "transfer time = %.3f sec\n", + ctx->rank, + transfer.result.file_size_bytes, + transfer.result.transfer_time_seconds); } + } else { + ret = transfer.result.error; } } - } else { - if (verbose) { - fprintf(stdout, "[%d] parallel transfer: src=%s, dst=%s\n", - rank, src, dst); - } - - MPI_Barrier(MPI_COMM_WORLD); + } + } - ret = unifyfs_transfer_file_parallel(src, dst); - if (ret) { - fprintf(stderr, "[%d] failed to transfer file (err=%d)\n", - rank, -ret); - goto out; + if (dst_in_unify) { + unsigned long src_file_size = 0; + if (mgr_rank == ctx->rank) { + struct stat ss; + errno = 0; + rc = stat(src_file_path, &ss); + if (rc) { + err = errno; + fprintf(stderr, "[rank=%d] UNIFYFS-STAGE ERROR: " + "failed to stat(src=%s) - %s\n", + ctx->rank, src, strerror(err)); + ret = err; + } else { + src_file_size = (unsigned long) ss.st_size; + } + } + rc = MPI_Bcast((void*)&src_file_size, 1, MPI_UNSIGNED_LONG, + mgr_rank, MPI_COMM_WORLD); + if (rc != MPI_SUCCESS) { + char mpi_errstr[MPI_MAX_ERROR_STRING]; + int errstr_len = 0; + MPI_Error_string(rc, mpi_errstr, &errstr_len); + fprintf(stderr, "[rank=%d] UNIFYFS-STAGE ERROR: " + "MPI_Bcast() of source file size failed - %s\n", + ctx->rank, mpi_errstr); + ret = UNIFYFS_FAILURE; + } else { + size_t transfer_blksz = UNIFYFS_STAGE_TRANSFER_BLOCKSIZE; + size_t n_blocks = src_file_size / transfer_blksz; + if (src_file_size % transfer_blksz) { + n_blocks++; } - MPI_Barrier(MPI_COMM_WORLD); - - if (ctx->checksum && 0 == rank) { - ret = verify_checksum(src, dst); - if (ret) { - goto out; + if (ctx->data_dist == UNIFYFS_STAGE_DATA_BALANCED) { + /* spread source file data evenly across clients */ + rc = distribute_source_file_data(ctx, src, dst, + transfer_blksz, n_blocks); + if (rc) { + ret = rc; + fprintf(stderr, "[rank=%d] UNIFYFS-STAGE ERROR: " + "failed to distribute src=%s to dst=%s - %s\n", + ctx->rank, src, dst, strerror(ret)); } + } else { // UNIFYFS_STAGE_DATA_SKEWED + // TODO: implement skewed data distribution + ret = UNIFYFS_ERROR_NYI; } } - - count++; - } -out: - if (ret) { - fprintf(stderr, "failed to transfer file (src=%s, dst=%s): %s\n", - src, 
dst, strerror(ret)); } - if (fp) { - fclose(fp); - fp = NULL; + if (0 == ret) { /* transfer completed OK */ + if ((mgr_rank == ctx->rank) && (ctx->checksum)) { + rc = verify_checksum(ctx, src_in_unify, dst_in_unify, + src, dst); + if (rc) { + fprintf(stderr, + "[rank=%d] UNIFYFS-STAGE ERROR: " + "checksums differ for src=%s, dst=%s !\n", + ctx->rank, src, dst); + ret = rc; + } + } } + if (0 != ret) { /* something went wrong */ + if (mgr_rank == ctx->rank) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "stage transfer failed for index[%d] " + "src=%s, dst=%s (error=%d)\n", + file_index, src, dst, ret); + } + } return ret; } diff --git a/util/unifyfs-stage/src/unifyfs-stage.c b/util/unifyfs-stage/src/unifyfs-stage.c index 947d58e3f..31b51e1be 100644 --- a/util/unifyfs-stage/src/unifyfs-stage.c +++ b/util/unifyfs-stage/src/unifyfs-stage.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. + * Copyright (c) 2022, Lawrence Livermore National Security, LLC. * Produced at the Lawrence Livermore National Laboratory. * - * Copyright 2020, UT-Battelle, LLC. + * Copyright 2022, UT-Battelle, LLC. * * LLNL-CODE-741539 * All rights reserved. @@ -11,60 +11,34 @@ * For details, see https://github.com/LLNL/UnifyFS. * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. */ -/* unifyfs-stage: this application is supposed to excuted by the unifyfs - * command line utility for: - * - stage in: moving files in pfs to unifyfs volume before user starts - * application, - * e.g., unifyfs start --stage-in= - * - stage out: moving files in the unifyfs volume to parallel file system - * after user application completes, - * e.g., unifyfs terminate --stage-out= - * - * Currently, we request users to pass the to specify target - * files to be transferred. The should list all target files - * and their destinations, line by line. - * - * This supports two transfer modes (although both are technically parallel): - * - * - serial: Each process will transfer a file. Data of a single file will - * reside in a single compute node. - * - parallel (-p, --parallel): Each file will be split and transferred by all - * processes. Data of a single file will be spread evenly across all - * available compute nodes. - * - * TODO: - * Maybe later on, it would be better to have a size threshold. Based on the - * threshold, we can determine whether a file needs to transferred serially (if - * smaller than threshold), or parallelly. - */ + #include -#include -#include -#include -#include +#include #include +#include #include #include #include -#include +#include +#include +#include +#include -#include "unifyfs_const.h" #include "unifyfs-stage.h" -int rank; -int total_ranks; + int verbose; -static int debug; +static int debug_pause; static int checksum; -static int mode; -static int should_we_mount_unifyfs = 1; +static int data_distribution; +static int transfer_mode; static char* manifest_file; static char* mountpoint = "/unifyfs"; -static char* share_dir; +static char* status_file; -static unifyfs_stage_t _ctx; +static unifyfs_stage stage_ctx; /** * @brief create a status (lock) file to notify the unifyfs executable @@ -76,52 +50,43 @@ static unifyfs_stage_t _ctx; */ static int create_status_file(int status) { - char filename[PATH_MAX]; - FILE* fp = NULL; - const char* msg = status ? 
"fail" : "success"; - int return_val_from_scnprintf; - - return_val_from_scnprintf = - scnprintf(filename, PATH_MAX, - "%s/%s", share_dir, UNIFYFS_STAGE_STATUS_FILENAME); - if (return_val_from_scnprintf > (PATH_MAX - 1)) { - fprintf(stderr, "Stage status file is too long!\n"); - return -ENOMEM; - } + int err; - fp = fopen(filename, "w"); + errno = 0; + FILE* fp = fopen(status_file, "w"); + err = errno; if (!fp) { - fprintf(stderr, "failed to create %s (%s)\n", - filename, strerror(errno)); - return errno; + fprintf(stderr, "UNIFYFS-STAGE ERROR: failed to create %s (%s)\n", + status_file, strerror(err)); + return err; } + const char* msg = (status == 0 ? "success" : "fail"); fprintf(fp, "%s\n", msg); - fclose(fp); return 0; } +static char* short_opts = "cDhm:psS:v"; + static struct option long_opts[] = { { "checksum", 0, 0, 'c' }, - { "debug", 0, 0, 'd' }, + { "debug-pause", 0, 0, 'D' }, { "help", 0, 0, 'h' }, { "mountpoint", 1, 0, 'm' }, { "parallel", 0, 0, 'p' }, - { "share-dir", 1, 0, 's' }, + { "skewed", 0, 0, 's' }, + { "status-file", 1, 0, 'S' }, { "verbose", 0, 0, 'v' }, - { "no-mount-unifyfs", 0, 0, 'N' }, { 0, 0, 0, 0 }, }; -static char* short_opts = "cdhm:ps:vN"; - static const char* usage_str = "\n" "Usage: %s [OPTION]... \n" "\n" - "Transfer files between unifyfs volume and external file system.\n" + "Transfer files between UnifyFS volume and external file system.\n" "The should contain list of files to be transferred,\n" "and each line should be formatted as\n" "\n" @@ -135,49 +100,43 @@ static const char* usage_str = "\n" "Available options:\n" "\n" - " -c, --checksum verify md5 checksum for each transfer\n" - " -h, --help print this usage\n" - " -m, --mountpoint= use as unifyfs mountpoint\n" + " -c, --checksum Verify md5 checksum for each transfer\n" + " (default: off)\n" + " -h, --help Print usage information\n" + " -m, --mountpoint= Use as UnifyFS mountpoint\n" " (default: /unifyfs)\n" - " -p, --parallel transfer each file in parallel\n" - " (experimental)\n" - " -s, --share-dir= directory path for creating status file\n" - " -v, --verbose print noisy outputs\n" - " -N, --no-mount-unifyfs don't mount unifyfs file system (for testing)\n" - "\n" - "Without the '-p, --parallel' option, a file is transferred by a single\n" - "process. If the '-p, --parallel' option is specified, each file will be\n" - "divided by multiple processes and transferred in parallel.\n" + " -p, --parallel Transfer all files concurrently\n" + " (default: off, use sequential transfers)\n" + " -s, --skewed Use skewed data distribution for stage-in\n" + " (default: off, use balanced distribution)\n" + " -S, --status-file= Create stage status file at \n" + " -v, --verbose Print verbose information\n" + " (default: off)\n" "\n"; -static char* program; - -static void print_usage(void) +static void print_usage(char* program) { - if (0 == rank) { - fprintf(stdout, usage_str, program); - } + fprintf(stderr, usage_str, program); } -static -void debug_pause(int rank, const char* fmt, ...) +static int debug_hold; +static void pause_for_debug(int rank, const char* fmt, ...) { if (rank == 0) { - va_list args; - - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); - - fprintf(stderr, " ENTER to continue ... 
"); + debug_hold = 1; + fprintf(stderr, + "UNIFYFS-STAGE DEBUG - PAUSED: To continue execution, use " + "debugger to set 'debug_hold' variable = 0 in MPI rank 0\n"); + fflush(stderr); + while (debug_hold) { + sleep(1); + } - (void) getchar(); + fprintf(stderr, "UNIFYFS-STAGE DEBUG - CONTINUED\n"); + fflush(stderr); } MPI_Barrier(MPI_COMM_WORLD); - - /* internal accept() call from mpi may set errno */ - errno = 0; } static int parse_option(int argc, char** argv) @@ -186,6 +145,13 @@ static int parse_option(int argc, char** argv) int optidx = 0; char* filepath = NULL; + /* set defaults */ + checksum = 0; + debug_pause = 0; + status_file = NULL; + transfer_mode = UNIFYFS_STAGE_MODE_SERIAL; + data_distribution = UNIFYFS_STAGE_DATA_BALANCED; + if (argc < 2) { return EINVAL; } @@ -197,8 +163,8 @@ static int parse_option(int argc, char** argv) checksum = 1; break; - case 'd': - debug = 1; + case 'D': + debug_pause = 1; break; case 'm': @@ -206,20 +172,19 @@ static int parse_option(int argc, char** argv) break; case 'p': - mode = UNIFYFS_STAGE_PARALLEL; + transfer_mode = UNIFYFS_STAGE_MODE_PARALLEL; break; case 's': - share_dir = strdup(optarg); + data_distribution = UNIFYFS_STAGE_DATA_SKEWED; break; - case 'v': - verbose = 1; + case 'S': + status_file = strdup(optarg); break; - case 'N': - fprintf(stderr, "WARNING: not mounting unifyfs file system!\n"); - should_we_mount_unifyfs = 0; + case 'v': + verbose = 1; break; case 'h': @@ -236,88 +201,298 @@ static int parse_option(int argc, char** argv) manifest_file = realpath(filepath, NULL); if (!manifest_file) { - fprintf(stderr, "problem with accessing file %s: %s\n", + fprintf(stderr, + "UNIFYFS-STAGE ERROR: " + "could not access manifest file %s: %s\n", filepath, strerror(errno)); return errno; } + if (NULL == status_file) { + /* status_file is required for transfer status file */ + return EINVAL; + } + return 0; } -int main(int argc, char** argv) +void print_unifyfs_stage_context(unifyfs_stage* ctx) +{ + const char* mode_str = (ctx->mode == UNIFYFS_STAGE_MODE_SERIAL ? + "serial" : "parallel"); + const char* dist_str = (ctx->data_dist == UNIFYFS_STAGE_DATA_BALANCED ? + "balanced" : "skewed"); + fprintf(stderr, + "UNIFYFS-STAGE INFO: ==== stage context ====\n" + " : manifest file = %s\n" + " : mountpoint = %s\n" + " : transfer mode = %s\n" + " : data distribution = %s\n" + " : verify checksums = %d\n" + " : rank = %d of %d\n", + ctx->manifest_file, + ctx->mountpoint, + mode_str, + dist_str, + ctx->checksum, + ctx->rank, ctx->total_ranks); +} + +/* + * Parse a line from the manifest in the form of: + * + * + * + * If the paths have spaces, they must be quoted. + * + * On success, returns 0 along with allocated src_file and dst_file strings. + * These strings should be freed by the caller. + * + * On failure, returns non-zero, and set src and dst to NULL. + * + * Note, leading and tailing whitespace are ok. They just get ignored. + * Lines with only whitespace or starting with the comment character '#' + * are ignored, and the return value will be 0 with src and dst being NULL. 
+ */ +/** + * @brief parses manifest file line, passes back src and dst strings + * + * @param line_number manifest file line number + * @param line manifest file line + * @param[out] src_file source file path + * @param[out] dst_file destination file path + * + * @return 0 if all was well, or there was nothing; non-zero on error + */ +int unifyfs_parse_manifest_line(int line_number, + char* line, + char** src_file, + char** dst_file) { + char* src = NULL; + char* dst = NULL; + char* copy; + char* tmp; + size_t copy_len; + size_t tmp_len; + size_t i; + int in_quotes = 0; int ret = 0; - unifyfs_stage_t* ctx = &_ctx; - program = basename(strdup(argv[0])); + + if ((NULL == line) || (NULL == src_file) || (NULL == dst_file)) { + return EINVAL; + } + + *src_file = NULL; + *dst_file = NULL; + + if ((line[0] == '\n') || (line[0] == '#')) { + // skip blank or comment (#) lines in manifest file + return 0; + } + + copy = strdup(line); + if (NULL == copy) { + return ENOMEM; + } + copy_len = strlen(copy) + 1; /* +1 for '\0' */ + + /* Replace quotes and separator with NUL character */ + for (i = 0; i < copy_len; i++) { + if (copy[i] == '"') { + in_quotes ^= 1;/* toggle */ + copy[i] = '\0'; + } else if (isspace(copy[i]) && !in_quotes) { + /* + * Allow any whitespace for our separator + */ + copy[i] = '\0'; + } + } + + /* copy now contains a series of strings, separated by NUL characters */ + tmp = copy; + while (tmp < (copy + copy_len)) { + tmp_len = strlen(tmp); + if (tmp_len > 0) { + /* We have a real string */ + if (!src) { + src = strdup(tmp); + if (NULL == src) { + return ENOMEM; + } + } else { + if (!dst) { + dst = strdup(tmp); + if (NULL == dst) { + return ENOMEM; + } + } else { + /* Error: a third file name */ + ret = EINVAL; + break; + } + } + } + tmp += tmp_len + 1; + } + free(copy); + + /* Some kind of error parsing a line */ + if ((ret != 0) || (NULL == src) || (NULL == dst)) { + if (NULL != src) { + free(src); + src = NULL; + } + if (NULL != dst) { + free(dst); + dst = NULL; + } + if (ret == 0) { + ret = EINVAL; + } + } else { + *src_file = src; + *dst_file = dst; + } + + return ret; +} + +int main(int argc, char** argv) +{ + int rc, ret; + int rank, total_ranks; + unifyfs_stage* ctx = &stage_ctx; + + char* program = basename(strdup(argv[0])); ret = parse_option(argc, argv); if (ret) { if (EINVAL == ret) { - print_usage(); + print_usage(program); } - goto preMPIout; + return ret; } MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &total_ranks); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - ctx->rank = rank; - ctx->total_ranks = total_ranks; + ctx->checksum = checksum; - ctx->mode = mode; - ctx->mountpoint = mountpoint; + ctx->data_dist = data_distribution; ctx->manifest_file = manifest_file; - -#if defined(ENABLE_MPI_MOUNT) - ctx->enable_mpi_mount = 1; -#endif - + ctx->mode = transfer_mode; + ctx->mountpoint = mountpoint; + ctx->rank = rank; + ctx->total_ranks = total_ranks; if (verbose) { - unifyfs_stage_print(ctx); + print_unifyfs_stage_context(ctx); } - if (debug) { - debug_pause(rank, "About to mount unifyfs.. "); + if (debug_pause) { + pause_for_debug(rank, "About to initialize UnifyFS.. 
"); } - if (should_we_mount_unifyfs && !ctx->enable_mpi_mount) { - ret = unifyfs_mount(mountpoint, rank, total_ranks); - if (ret) { - fprintf(stderr, "failed to mount unifyfs at %s (%s)", - ctx->mountpoint, strerror(ret)); - goto out; - } + // initialize UnifyFS API handle for transfer + unifyfs_handle fshdl = UNIFYFS_INVALID_HANDLE; /* client handle */ + unifyfs_rc urc = unifyfs_initialize(ctx->mountpoint, NULL, 0, &(fshdl)); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "UnifyFS initialization for mntpt=%s failed (%s)", + ctx->mountpoint, unifyfs_rc_enum_description(urc)); + ret = -1; + MPI_Abort(MPI_COMM_WORLD, ret); } + ctx->fshdl = fshdl; MPI_Barrier(MPI_COMM_WORLD); - ret = unifyfs_stage_transfer(ctx); - if (ret) { - fprintf(stderr, "data transfer failed (%s)\n", strerror(errno)); + /* TODO - Currently, all ranks open and parse the manifest file. It may be + * better if rank 0 does that and then broadcasts the file pairs */ + FILE* manifest = NULL; + manifest = fopen(ctx->manifest_file, "r"); + if (!manifest) { + if (rank == 0) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to open manifest file %s (%s)\n", + ctx->manifest_file, strerror(errno)); + } + ret = errno; + MPI_Abort(MPI_COMM_WORLD, ret); + } + + int line_count = 0; + int file_count = 0; + int n_xfer_failures = 0; + char* src = NULL; + char* dst = NULL; + char linebuf[LINE_MAX] = { 0, }; + while (NULL != fgets(linebuf, LINE_MAX - 1, manifest)) { + line_count++; + rc = unifyfs_parse_manifest_line(line_count, linebuf, &src, &dst); + if (EINVAL == rc) { + if (rank == 0) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "manifest line[%d] is invalid! - '%s'\n", + line_count, linebuf); + } + ret = rc; + } else if ((0 == rc) && (NULL != src) && (NULL != dst)) { + file_count++; + rc = unifyfs_stage_transfer(ctx, file_count, src, dst); + if (rc) { + if (rc != EINVAL) { + n_xfer_failures++; + } + ret = rc; + } + } } - /* wait until all processes are done */ + // wait until all processes are done MPI_Barrier(MPI_COMM_WORLD); - if (share_dir && rank == 0) { - ret = create_status_file(ret); - if (ret) { - fprintf(stderr, "failed to create the status file (%s)\n", - strerror(errno)); - } + // use a global reduction to sum total number of transfer failures + int total_xfer_failures = 0; + rc = MPI_Reduce(&n_xfer_failures, &total_xfer_failures, + 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + if (rc != MPI_SUCCESS) { + char errstr[MPI_MAX_ERROR_STRING]; + int len = 0; + MPI_Error_string(rc, errstr, &len); + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to aggregate transfer failures (error='%s')\n", + errstr); + total_xfer_failures = 1; + } + if (0 == ret) { + ret = total_xfer_failures; } - if (should_we_mount_unifyfs && !ctx->enable_mpi_mount) { - ret = unifyfs_unmount(); - if (ret) { - fprintf(stderr, "unmounting unifyfs failed (ret=%d)\n", ret); + // rank 0 reports overall status + if (rank == 0) { + rc = create_status_file(ret); + if (rc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "failed to create stage status file (error=%d)\n", + rc); + ret = rc; } } -out: + + // finalize UnifyFS API handle + urc = unifyfs_finalize(fshdl); + if (UNIFYFS_SUCCESS != urc) { + fprintf(stderr, "UNIFYFS-STAGE ERROR: " + "UnifyFS finalization failed - %s", + unifyfs_rc_enum_description(urc)); + } + fshdl = UNIFYFS_INVALID_HANDLE; + MPI_Finalize(); -preMPIout: return ret; } diff --git a/util/unifyfs-stage/src/unifyfs-stage.h b/util/unifyfs-stage/src/unifyfs-stage.h index 229ad9648..48b7ee1cc 100644 --- 
a/util/unifyfs-stage/src/unifyfs-stage.h +++ b/util/unifyfs-stage/src/unifyfs-stage.h @@ -1,67 +1,71 @@ -#ifndef __UNIFYFS_STAGE_H -#define __UNIFYFS_STAGE_H +#ifndef UNIFYFS_STAGE_H +#define UNIFYFS_STAGE_H -#include -#include -#include +#include "unifyfs_api.h" +#include "unifyfs_misc.h" -#define UNIFYFS_STAGE_MD5_BLOCKSIZE (1048576) +#ifndef UNIFYFS_STAGE_MD5_BLOCKSIZE +#define UNIFYFS_STAGE_MD5_BLOCKSIZE (1048576) +#endif + +#ifndef UNIFYFS_STAGE_TRANSFER_BLOCKSIZE +#define UNIFYFS_STAGE_TRANSFER_BLOCKSIZE (16 * 1048576) +#endif + +extern int verbose; -/* - * serial: each file is tranferred by a process. - * parallel: a file is transferred by all processes. - */ enum { - UNIFYFS_STAGE_SERIAL = 0, - UNIFYFS_STAGE_PARALLEL = 1, + UNIFYFS_STAGE_MODE_SERIAL = 0, /* sequential file transfers */ + UNIFYFS_STAGE_MODE_PARALLEL = 1, /* concurrent file transfers */ + UNIFYFS_STAGE_DATA_BALANCED = 2, /* balanced data placement */ + UNIFYFS_STAGE_DATA_SKEWED = 3 /* skewed data placement */ }; struct _unifyfs_stage { + int checksum; /* perform checksum? 0:no, 1:yes */ + int data_dist; /* data distribution? UNIFYFS_STAGE_DATA_xxxx */ + int mode; /* transfer mode? UNIFYFS_STAGE_MODE_xxxx */ + int rank; /* my rank */ int total_ranks; /* mpi world size */ - int checksum; /* perform checksum? 0:no, 1:yes */ - int mode; /* transfer mode? 0:serial, 1:parallel */ - int should_we_mount_unifyfs; /* mount? 0:no (for testing), 1: yes */ - int enable_mpi_mount; /* automount during MPI_Init() */ char* mountpoint; /* unifyfs mountpoint */ char* manifest_file; /* manifest file containing the transfer list */ + + unifyfs_handle fshdl; /* UnifyFS API client handle */ }; +typedef struct _unifyfs_stage unifyfs_stage; -typedef struct _unifyfs_stage unifyfs_stage_t; +void print_unifyfs_stage_context(unifyfs_stage* ctx); -static inline void unifyfs_stage_print(unifyfs_stage_t* ctx) -{ - printf("== unifyfs stage context ==\n" - "rank = %d\n" - "total ranks = %d\n" - "checksum = %d\n" - "mode = %d\n" - "should_we_mount_unifyfs = %d\n" - "mpi_mount = %d\n" - "mountpoint = %s\n" - "manifest file = %s\n", - ctx->rank, - ctx->total_ranks, - ctx->checksum, - ctx->mode, - ctx->should_we_mount_unifyfs, - ctx->enable_mpi_mount, - ctx->mountpoint, - ctx->manifest_file); -} +/** + * @brief parses manifest file line, passes back src and dst strings + * + * @param line_number manifest file line number + * @param line manifest file line + * @param[out] src_file source file path + * @param[out] dst_file destination file path + * + * @return 0 if all was well, or there was nothing; non-zero on error + */ +int unifyfs_parse_manifest_line(int line_number, + char* line, + char** src_file, + char** dst_file); /** - * @brief transfer files specified in @ctx + * @brief transfer source file to destination according to stage context * - * @param ctx unifyfs_stage_t data transfer context + * @param ctx stage context + * @param file_index file index within manifest + * @param src_file_path source file path + * @param dst_file_path destination file path * * @return 0 on success, errno otherwise */ -int unifyfs_stage_transfer(unifyfs_stage_t* ctx); - -extern int verbose; -extern int rank; -extern int total_ranks; +int unifyfs_stage_transfer(unifyfs_stage* ctx, + int file_index, + const char* src_file_path, + const char* dst_file_path); -#endif /* __UNIFYFS_STAGE_H */ +#endif /* UNIFYFS_STAGE_H */ diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index 37319347b..76fbab847 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ 
b/util/unifyfs/src/unifyfs-rm.c @@ -190,6 +190,24 @@ static int write_hostfile(unifyfs_resource_t* resource, return ret; } +static inline +int construct_server_pids_filename(unifyfs_args_t* args) +{ + char filename[PATH_MAX]; + int rc = scnprintf(filename, sizeof(filename), "%s/%s", + args->share_dir, UNIFYFS_SERVER_PID_FILENAME); + if (rc > (sizeof(filename) - 2)) { + fprintf(stderr, "Unifyfs status filename is too long!\n"); + return ENOMEM; + } else { + args->share_pidfile = strdup(filename); + if (NULL == args->share_pidfile) { + return ENOMEM; + } + } + return 0; +} + /** * @brief wait until servers become ready for client connections * @@ -201,46 +219,33 @@ static int write_hostfile(unifyfs_resource_t* resource, static int wait_server_initialization(unifyfs_resource_t* resource, unifyfs_args_t* args) { + int err; int ret = UNIFYFS_SUCCESS; - int count = 0; + size_t count = 0; unsigned int interval = 3; unsigned int wait_time = 0; FILE* fp = NULL; char linebuf[32]; - char filename[PATH_MAX]; - int return_val_from_scnprintf; - - return_val_from_scnprintf = - scnprintf(filename, PATH_MAX, - "%s/%s", args->share_dir, UNIFYFS_SERVER_PID_FILENAME); - if (return_val_from_scnprintf > (PATH_MAX - 2)) { - fprintf(stderr, "Unifyfs status filename is too long!\n"); - return -ENOMEM; - } while (1) { - int err; errno = 0; - fp = fopen(filename, "r"); + fp = fopen(args->share_pidfile, "r"); err = errno; if (fp) { while (fgets(linebuf, 31, fp) != NULL) { count++; } - if (count != resource->n_nodes) { fprintf(stderr, - "incorrect server initialization: " - "expected %lu processes but only %u processes found\n", - resource->n_nodes, count); + "only found %zu of %zu server processes\n", + count, resource->n_nodes); ret = UNIFYFS_FAILURE; } - fclose(fp); break; - } else if (err != ENOENT) { + } else if (ENOENT != err) { fprintf(stderr, "failed to open file %s (%s)\n", - filename, strerror(err)); + args->share_pidfile, strerror(err)); ret = -err; break; } @@ -268,6 +273,25 @@ static inline unsigned int estimate_timeout(const char* manifest_file) return 20 * 60; } +static inline +int construct_stage_status_filename(unifyfs_args_t* args) +{ + char filename[PATH_MAX]; + int rc = scnprintf(filename, sizeof(filename), "%s/%s.%s", + args->share_dir, + UNIFYFS_STAGE_STATUS_FILENAME, + (args->stage_in ? 
"in" : "out")); + if (rc > (sizeof(filename) - 2)) { + fprintf(stderr, "UnifyFS status filename is too long!\n"); + return ENOMEM; + } else { + args->stage_status = strdup(filename); + if (NULL == args->stage_status) { + return ENOMEM; + } + } + return 0; +} /** * @brief wait until data stage operation finishes @@ -280,6 +304,7 @@ static inline unsigned int estimate_timeout(const char* manifest_file) static int wait_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) { + int err; int ret = UNIFYFS_SUCCESS; unsigned int interval = 5; unsigned int wait_time = 0; @@ -288,15 +313,6 @@ int wait_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) const char* manifest_file = NULL; char filename[PATH_MAX]; char linebuf[16]; - int return_val_from_scnprintf; - - return_val_from_scnprintf = - scnprintf(filename, PATH_MAX, - "%s/%s", args->share_dir, UNIFYFS_STAGE_STATUS_FILENAME); - if (return_val_from_scnprintf > (PATH_MAX - 2)) { - fprintf(stderr, "Unifyfs status filename is too long!\n"); - return -ENOMEM; - } if (args->stage_timeout > 0) { timeout = args->stage_timeout; @@ -305,29 +321,31 @@ int wait_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) } while (1) { - fp = fopen(filename, "r"); + errno = 0; + fp = fopen(args->stage_status, "r"); + err = errno; if (fp) { char* line = fgets(linebuf, 15, fp); if (0 == strncmp("success", line, strlen("success"))) { + // transfer completed successfully fclose(fp); fp = NULL; ret = 0; - break; // transfer completed + break; } else if (0 == strncmp("fail", line, strlen("fail"))) { + // transfer failed fclose(fp); fp = NULL; ret = -EIO; - break; // transfer failed - } else { - fclose(fp); // try again + break; + } else { // opened, but no content yet? try again + fclose(fp); + fp = NULL; } - } - - - if (errno != ENOENT) { + } else if (ENOENT != err) { fprintf(stderr, "failed to open file %s (%s)\n", - UNIFYFS_STAGE_STATUS_FILENAME, strerror(errno)); - ret = -errno; + filename, strerror(err)); + ret = -err; break; } @@ -351,26 +369,19 @@ int wait_stage(unifyfs_resource_t* resource, unifyfs_args_t* args) */ static int remove_server_pid_file(unifyfs_args_t* args) { + int err, rc; int ret = 0; - char filename[PATH_MAX]; - int return_val_from_scnprintf; - return_val_from_scnprintf = - scnprintf(filename, PATH_MAX, - "%s/%s", args->share_dir, UNIFYFS_SERVER_PID_FILENAME); - if (return_val_from_scnprintf > (PATH_MAX - 2)) { - fprintf(stderr, "Unifyfs status filename is too long!\n"); - return -ENOMEM; - } - - ret = unlink(filename); - if (ret) { - if (ENOENT == errno) { + errno = 0; + rc = unlink(args->share_pidfile); + err = errno; + if (rc) { + if (ENOENT == err) { ret = 0; } else { fprintf(stderr, "failed to unlink existing pid file %s (%s)\n", - filename, strerror(errno)); - ret = -errno; + args->share_pidfile, strerror(err)); + ret = -err; } } @@ -385,26 +396,20 @@ static int remove_server_pid_file(unifyfs_args_t* args) */ static int remove_stage_status_file(unifyfs_args_t* args) { + int err, rc; int ret = 0; - char filename[PATH_MAX]; - int return_val_from_scnprintf; - return_val_from_scnprintf = - scnprintf(filename, PATH_MAX, - "%s/%s", args->share_dir, UNIFYFS_STAGE_STATUS_FILENAME); - if (return_val_from_scnprintf > (PATH_MAX - 2)) { - fprintf(stderr, "Unifyfs stage status filename is too long!\n"); - return -ENOMEM; - } - - ret = unlink(filename); - if (ret) { - if (ENOENT == errno) { + errno = 0; + rc = unlink(args->stage_status); + err = errno; + if (rc) { + if (ENOENT == err) { ret = 0; } else { - fprintf(stderr, "failed to unlink 
existing stage status file " - "%s (%s)\n", filename, strerror(errno)); - ret = -errno; + fprintf(stderr, + "failed to unlink existing stage status file %s (%s)\n", + args->stage_status, strerror(err)); + ret = -err; } } @@ -693,7 +698,7 @@ static size_t construct_server_argv(unifyfs_args_t* args, // future, this may be reconfigured to have more, to support // more files being staged in or out more quickly. /** - * @brief Constructs argument chain to mpi-start (or terminate) + * @brief Constructs argument chain to start (or terminate) * unifyfs-stage stagein/out process. * * @param args The command-line options @@ -707,7 +712,7 @@ static size_t construct_stage_argv(unifyfs_args_t* args, size_t argc = 0; if (stage_argv != NULL) { - stage_argv[0] = strdup(LIBEXECDIR "/unifyfs-stage"); + stage_argv[0] = strdup(BINDIR "/unifyfs-stage"); } argc = 1; @@ -719,12 +724,26 @@ static size_t construct_stage_argv(unifyfs_args_t* args, argc += 2; } + if (args->debug) { + if (stage_argv != NULL) { + stage_argv[argc] = strdup("--verbose"); + } + argc += 1; + } + + if (args->stage_parallel) { + if (stage_argv != NULL) { + stage_argv[argc] = strdup("--parallel"); + } + argc += 1; + } + if (stage_argv != NULL) { char* manifest_file = args->stage_in ? args->stage_in - : args->stage_out; + : args->stage_out; - stage_argv[argc] = strdup("-s"); - stage_argv[argc + 1] = strdup(args->share_dir); + stage_argv[argc] = strdup("--status-file"); + stage_argv[argc + 1] = strdup(args->stage_status); stage_argv[argc + 2] = strdup(manifest_file); } argc += 3; @@ -781,17 +800,23 @@ static int generic_stage(char* cmd, int run_argc, unifyfs_args_t* args) stage_argc = construct_stage_argv(args, NULL); - char* token = strtok(cmd, " "); argc = 1 + run_argc + stage_argc; argv = calloc(argc, sizeof(char*)); + char* token = strtok(cmd, " "); for (int i = 0; i < run_argc; i++) { argv[i] = token; token = strtok(NULL, " "); } construct_stage_argv(args, argv + run_argc); - + if (args->debug) { + for (int i = 0; i < (argc - 1); i++) { + fprintf(stdout, "UNIFYFS DEBUG: stage_argv[%d] = %s\n", + i, argv[i]); + fflush(stdout); + } + } execvp(argv[0], argv); return -errno; @@ -904,7 +929,7 @@ static int jsrun_stage(unifyfs_resource_t* resource, generic_stage(cmd, jsrun_argc, args); - perror("failed to execvp() mpirun to handle data stage"); + perror("failed to execvp() jsrun to handle data stage"); return -errno; } @@ -1256,13 +1281,19 @@ int unifyfs_start_servers(unifyfs_resource_t* resource, rc = write_hostfile(resource, args); if (rc) { - fprintf(stderr, "Failed to write shared server hostfile!\n"); + fprintf(stderr, "Failed to write server hosts file!\n"); + return rc; + } + + rc = construct_server_pids_filename(args); + if (rc) { + fprintf(stderr, "Failed to construct server pids filename!\n"); return rc; } rc = remove_server_pid_file(args); if (rc) { - fprintf(stderr, "Failed to remove server pid file!\n"); + fprintf(stderr, "Failed to remove server pids file!\n"); return rc; } @@ -1285,6 +1316,11 @@ int unifyfs_start_servers(unifyfs_resource_t* resource, } if (args->stage_in) { + rc = construct_stage_status_filename(args); + if (rc) { + return rc; + } + rc = remove_stage_status_file(args); if (rc) { fprintf(stderr, "Failed to remove stage status file\n"); @@ -1321,6 +1357,11 @@ int unifyfs_stop_servers(unifyfs_resource_t* resource, } if (args->stage_out) { + rc = construct_stage_status_filename(args); + if (rc) { + return rc; + } + rc = remove_stage_status_file(args); if (rc) { fprintf(stderr, "Failed to remove stage status file\n"); 
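NOTE (illustration, not part of the patch): the launcher-specific stage hooks above (jsrun_stage, mpirun_stage, srun_stage) all delegate to generic_stage(), which tokenizes a launcher command string into an argv and appends the unifyfs-stage arguments before calling execvp(). A minimal, self-contained sketch of that argv-building pattern is shown below; the command string, file paths, and argument counts are illustrative stand-ins, not actual UnifyFS defaults.

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* launcher portion of the command (illustrative values) */
        char cmd[] = "srun -N 4 --ntasks-per-node 1";
        /* arguments for the stage utility (hypothetical paths) */
        char* stage_args[] = { "/path/to/unifyfs-stage",
                               "--status-file", "/tmp/stage.status",
                               "/tmp/manifest" };
        size_t run_argc = 5;    /* number of tokens in cmd */
        size_t stage_argc = sizeof(stage_args) / sizeof(stage_args[0]);

        /* one extra slot stays zero-filled by calloc() and terminates argv */
        char** argv = calloc(1 + run_argc + stage_argc, sizeof(char*));
        if (NULL == argv) {
            return ENOMEM;
        }

        size_t i = 0;
        for (char* tok = strtok(cmd, " ");
             (tok != NULL) && (i < run_argc);
             tok = strtok(NULL, " ")) {
            argv[i++] = tok;    /* tokens point into cmd[] */
        }
        for (size_t j = 0; j < stage_argc; j++) {
            argv[i++] = stage_args[j];
        }

        execvp(argv[0], argv);  /* returns only on failure */
        perror("execvp");
        return errno;
    }

Because calloc() zero-fills the array, reserving one slot beyond run_argc + stage_argc provides the NULL terminator that execvp() requires without any extra bookkeeping.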
diff --git a/util/unifyfs/src/unifyfs.c b/util/unifyfs/src/unifyfs.c index 23438b71d..cdc30584b 100644 --- a/util/unifyfs/src/unifyfs.c +++ b/util/unifyfs/src/unifyfs.c @@ -76,13 +76,14 @@ static struct option const long_opts[] = { { "share-dir", required_argument, NULL, 'S' }, { "stage-in", required_argument, NULL, 'i' }, { "stage-out", required_argument, NULL, 'o' }, - { "timeout", required_argument, NULL, 't' }, + { "stage-parallel", no_argument, NULL, 'P' }, { "stage-timeout", required_argument, NULL, 'T' }, + { "timeout", required_argument, NULL, 't' }, { 0, 0, 0, 0 }, }; static char* program; -static char* short_opts = ":cC:de:hi:m:o:s:S:t:T:"; +static char* short_opts = ":cC:de:hi:m:o:Ps:S:t:T:"; static char* usage_str = "\n" "Usage: %s [options...]\n" @@ -104,10 +105,12 @@ static char* usage_str = " -S, --share-dir= [REQUIRED] shared file system for use by servers\n" " -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit\n" " -i, --stage-in= [OPTIONAL] stage in file(s) listed in file\n" + " -P, --stage-parallel [OPTIONAL] use parallel stage-in\n" " -T, --stage-timeout= [OPTIONAL] timeout for stage-in operation\n" "\n" "Command options for \"terminate\":\n" " -o, --stage-out= [OPTIONAL] stage out file(s) listed in on termination\n" + " -P, --stage-parallel [OPTIONAL] use parallel stage-out\n" " -T, --stage-timeout= [OPTIONAL] timeout for stage-out operation\n" " -s, --script= [OPTIONAL] to custom termination script\n" " -S, --share-dir= [REQUIRED for --stage-out] shared file system for use by servers\n" @@ -127,6 +130,7 @@ static void parse_cmd_arguments(int argc, char** argv) int optidx = 2; int cleanup = 0; int timeout = UNIFYFS_DEFAULT_INIT_TIMEOUT; + int stage_parallel = 0; int stage_timeout = -1; unifyfs_cm_e consistency = UNIFYFS_CM_LAMINATED; char* mountpoint = NULL; @@ -165,6 +169,10 @@ static void parse_cmd_arguments(int argc, char** argv) mountpoint = strdup(optarg); break; + case 'P': + stage_parallel = 1; + break; + case 's': script = strdup(optarg); break; @@ -210,6 +218,7 @@ static void parse_cmd_arguments(int argc, char** argv) cli_args.share_dir = share_dir; cli_args.stage_in = stage_in; cli_args.stage_out = stage_out; + cli_args.stage_parallel = stage_parallel; cli_args.stage_timeout = stage_timeout; cli_args.timeout = timeout; } @@ -247,12 +256,14 @@ int main(int argc, char** argv) printf("cleanup:\t%d\n", cli_args.cleanup); printf("consistency:\t%s\n", unifyfs_cm_enum_str(cli_args.consistency)); + printf("debug:\t%d\n", cli_args.debug); printf("mountpoint:\t%s\n", cli_args.mountpoint); printf("script:\t%s\n", cli_args.script); printf("share_dir:\t%s\n", cli_args.share_dir); printf("server:\t%s\n", cli_args.server_path); printf("stage_in:\t%s\n", cli_args.stage_in); printf("stage_out:\t%s\n", cli_args.stage_out); + printf("stage_parallel:\t%d\n", cli_args.stage_parallel); printf("stage_timeout:\t%d\n", cli_args.stage_timeout); } diff --git a/util/unifyfs/src/unifyfs.h b/util/unifyfs/src/unifyfs.h index 7f6103969..f986ce5e0 100644 --- a/util/unifyfs/src/unifyfs.h +++ b/util/unifyfs/src/unifyfs.h @@ -62,10 +62,13 @@ struct _unifyfs_args { char* mountpoint; /* mountpoint */ char* server_path; /* full path to installed unifyfsd */ char* share_dir; /* full path to shared file system directory */ - char* share_hostfile; /* full path to shared server hostfile */ - char* stage_in; /* data path to stage-in */ - char* stage_out; /* data path to stage-out (drain) */ - int stage_timeout; /* timeout of (in or out) file staging*/ + char* share_hostfile; /* full 
path to shared server hosts file */ + char* share_pidfile; /* full path to shared server pids file */ + char* stage_in; /* full path to stage-in manifest file */ + char* stage_out; /* full path to stage-out manifest file */ + char* stage_status; /* full path to stage-in/out status file */ + int stage_parallel; /* enable parallal stage-in/out */ + int stage_timeout; /* timeout of (in or out) file staging */ char* script; /* path to custom launch/terminate script */ }; typedef struct _unifyfs_args unifyfs_args_t; From 3ecec03700436b072ba400115e4d26763938003a Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Wed, 8 Jun 2022 16:00:08 -0400 Subject: [PATCH 67/81] Disable GCC11 warning on call to PMIx_Get() The version of PMIx included with Spectrum MPI defines the PMIx_Get() function to take a parameter that ends up being a 'const char[512]'. However, the caller is expected to pass in a static const string that is less than 512 bytes long. GCC11 issues a warning about this size mismatch, and since we build with `-Werror`, that breaks the build. This commit adds some #pragma lines to disable that particular check for the call to PMIx_Get() when building with GCC 11. --- common/src/unifyfs_keyval.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 9187215fb..1cad4b2c4 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -372,7 +372,25 @@ int unifyfs_pmix_init(void) PMIX_PROC_CONSTRUCT(&proc); strlcpy(proc.nspace, pmix_myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; + +// This is a kludge for working around a bug in the version of PMIx installed +// on Summit (which is part of Spectrum MPI 10.4.0.3). Specifically, gcc 11 +// complains that the 2nd parameter - which is type 'const pmix_key_t', which +// in turn is 'char [512]' - is longer than the const string that actually gets +// passed in (which is itself defined in pmix_common.h). +// It would be nice to limit this to just Summit, but there doesn't seem to be +// an easy way to do that (no pre-defined macro we can check, for example). +// Once the bug is fixed and the version on Summit is updated, we can get rid +// of this test and just use the line in the 'else' clause +#if (__GNUC__ >= 11) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-overread" + rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); +#pragma GCC diagnostic pop +#else rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); +#endif + if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Get(JOB_SIZE) failed: %s", pmix_myproc.rank, PMIx_Error_string(rc)); From d50663be994c0379c4843a2c4a4a2827a177fadd Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 16 Jun 2022 14:19:02 -0400 Subject: [PATCH 68/81] A different fix for the PMIx_Get() gcc11 warning Replace the #ifdef and #pragma lines with a call to strcpy(). The code is cleaner and the extra cpu and memory usage from the copy is negligable. 
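For illustration only, the shape of the problem and of this workaround is roughly as sketched below. This is neither UnifyFS nor PMIx code: the type and function names are stand-ins, whether a given compiler actually warns depends on its version and flags, and the strncpy() with explicit termination is used purely to keep the sketch self-contained (the patch itself copies the short literal with a plain strcpy()).

    #include <stdio.h>
    #include <string.h>

    /* stand-in for a fixed-size key type such as pmix_key_t */
    typedef char example_key_t[64];

    /* stand-in for an API like PMIx_Get() that declares its key
     * parameter as a fixed-size char array */
    static int example_get(const example_key_t key)
    {
        printf("looking up key '%s'\n", key);
        return 0;
    }

    int main(void)
    {
        /* Passing a short string literal straight into a parameter declared
         * as a fixed-size char array is what gcc 11 flags with
         * -Wstringop-overread in the PMIx case described above. Copying the
         * literal into a buffer of the declared size first avoids the
         * warning. */
        example_key_t key;
        strncpy(key, "example.job.size", sizeof(key));
        key[sizeof(key) - 1] = '\0';
        return example_get(key);
    }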
--- common/src/unifyfs_keyval.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 1cad4b2c4..f1074e1aa 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -373,23 +373,13 @@ int unifyfs_pmix_init(void) strlcpy(proc.nspace, pmix_myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; -// This is a kludge for working around a bug in the version of PMIx installed -// on Summit (which is part of Spectrum MPI 10.4.0.3). Specifically, gcc 11 -// complains that the 2nd parameter - which is type 'const pmix_key_t', which -// in turn is 'char [512]' - is longer than the const string that actually gets -// passed in (which is itself defined in pmix_common.h). -// It would be nice to limit this to just Summit, but there doesn't seem to be -// an easy way to do that (no pre-defined macro we can check, for example). -// Once the bug is fixed and the version on Summit is updated, we can get rid -// of this test and just use the line in the 'else' clause -#if (__GNUC__ >= 11) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstringop-overread" + // Note: we do an extra copy because passing PMIX_JOB_SIZE directly to + // PMIx_Get() causes gcc 11 to generate a warning due to the fact that + // PMIX_JOB_SIZE evaluates to a 14 byte char array while pmix_key_t is + // (at least) 64 bytes. + pmix_key_t key; + strcpy(key, PMIX_JOB_SIZE); rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); -#pragma GCC diagnostic pop -#else - rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); -#endif if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Get(JOB_SIZE) failed: %s", From de4c748f6afc5f9d3ea8d0dd072309444d59ed81 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 16 Jun 2022 14:37:35 -0400 Subject: [PATCH 69/81] Pass correct variable to PMIx_Get() Pass the copied value into PMIx_Get() rather than the original defined string. (This should have been part of the previous commit.) --- common/src/unifyfs_keyval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index f1074e1aa..31693a25d 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -379,7 +379,7 @@ int unifyfs_pmix_init(void) // (at least) 64 bytes. pmix_key_t key; strcpy(key, PMIX_JOB_SIZE); - rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &valp); + rc = PMIx_Get(&proc, key, NULL, 0, &valp); if (rc != PMIX_SUCCESS) { LOGERR("PMIx rank %d: PMIx_Get(JOB_SIZE) failed: %s", From 5a4215db1485958e079543a9cf56529ef4b982f7 Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Thu, 7 Jul 2022 10:25:54 -0400 Subject: [PATCH 70/81] Use strlcpy() instead of strcpy() Use strlcpy() when copying the PMIX_JOB_SIZE key. --- common/src/unifyfs_keyval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/unifyfs_keyval.c b/common/src/unifyfs_keyval.c index 31693a25d..61e086edb 100644 --- a/common/src/unifyfs_keyval.c +++ b/common/src/unifyfs_keyval.c @@ -378,7 +378,7 @@ int unifyfs_pmix_init(void) // PMIX_JOB_SIZE evaluates to a 14 byte char array while pmix_key_t is // (at least) 64 bytes. pmix_key_t key; - strcpy(key, PMIX_JOB_SIZE); + strlcpy(key, PMIX_JOB_SIZE, sizeof(pmix_key_t)); rc = PMIx_Get(&proc, key, NULL, 0, &valp); if (rc != PMIX_SUCCESS) { From fdf24928178f221b818fa3f0143f6dcbe3d790a4 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Wed, 22 Jun 2022 08:00:53 -0400 Subject: [PATCH 71/81] fix re-initialization of API client config --- client/src/unifyfs.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/client/src/unifyfs.c b/client/src/unifyfs.c index f4909c60d..07dc51dcd 100644 --- a/client/src/unifyfs.c +++ b/client/src/unifyfs.c @@ -383,14 +383,16 @@ int unifyfs_client_init(unifyfs_client* client) // print log messages to stderr unifyfs_log_open(NULL); - // initialize configuration + // initialize configuration (if not already done) unifyfs_cfg_t* client_cfg = &(client->cfg); - rc = unifyfs_config_init(client_cfg, 0, NULL, 0, NULL); - if (rc) { - LOGERR("failed to initialize configuration."); - return UNIFYFS_FAILURE; + if (client_cfg->ptype != UNIFYFS_CLIENT) { + rc = unifyfs_config_init(client_cfg, 0, NULL, 0, NULL); + if (rc) { + LOGERR("failed to initialize configuration."); + return UNIFYFS_FAILURE; + } + client_cfg->ptype = UNIFYFS_CLIENT; } - client_cfg->ptype = UNIFYFS_CLIENT; // set log level from config char* cfgval = client_cfg->log_verbosity; From 0156626c1defc1f68059ec542cef174e49519d1d Mon Sep 17 00:00:00 2001 From: "Michael J. Brim" Date: Mon, 20 Jun 2022 14:15:26 -0400 Subject: [PATCH 72/81] disallow unifyfs_create() of existing file Only disallowed for the same client that did the original create. Also, add a unit test for this case. --- client/src/unifyfs-sysio.c | 15 ++++++- client/src/unifyfs_fid.c | 84 ++++++++++++++++++++++---------------- client/src/unifyfs_fid.h | 2 +- t/api/create-open-remove.c | 6 +++ 4 files changed, 69 insertions(+), 38 deletions(-) diff --git a/client/src/unifyfs-sysio.c b/client/src/unifyfs-sysio.c index 96d8c54a0..dbf2f1bfc 100644 --- a/client/src/unifyfs-sysio.c +++ b/client/src/unifyfs-sysio.c @@ -1115,6 +1115,11 @@ static int posix_create(char* upath, mode_t mode) int flags = O_WRONLY | O_CREAT | O_TRUNC; off_t pos; int rc = unifyfs_fid_open(posix_client, upath, flags, mode, &fid, &pos); + if (rc == EEXIST) { + /* POSIX allows O_CREAT on existing file */ + flags = O_WRONLY | O_TRUNC; + rc = unifyfs_fid_open(posix_client, upath, flags, mode, &fid, &pos); + } if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; @@ -1189,11 +1194,19 @@ int UNIFYFS_WRAP(open)(const char* path, int flags, ...) 
if (unifyfs_intercept_path(path, upath)) { /* TODO: handle relative paths using current working directory */ - /* create the file */ + /* open the file */ int fid; off_t pos; int rc = unifyfs_fid_open(posix_client, upath, flags, mode, &fid, &pos); + if (rc == EEXIST) { + /* POSIX allows O_CREAT on existing file */ + if ((flags & O_CREAT) && !(flags & O_EXCL)) { + flags -= O_CREAT; + rc = unifyfs_fid_open(posix_client, upath, flags, mode, + &fid, &pos); + } + } if (rc != UNIFYFS_SUCCESS) { errno = unifyfs_rc_errno(rc); return -1; diff --git a/client/src/unifyfs_fid.c b/client/src/unifyfs_fid.c index e7899c4d3..3f46beff3 100644 --- a/client/src/unifyfs_fid.c +++ b/client/src/unifyfs_fid.c @@ -194,7 +194,7 @@ int unifyfs_set_global_file_meta_from_fid(unifyfs_client* client, * returns the new fid, or negative value on error */ int unifyfs_fid_create_file(unifyfs_client* client, const char* path, - int exclusive) + int private) { /* check that pathname is within bounds */ size_t pathlen = strlen(path) + 1; @@ -232,7 +232,7 @@ int unifyfs_fid_create_file(unifyfs_client* client, meta->attrs.size = 0; meta->attrs.mode = UNIFYFS_STAT_DEFAULT_FILE_MODE; meta->attrs.is_laminated = 0; - meta->attrs.is_shared = !exclusive; + meta->attrs.is_shared = !private; meta->attrs.filename = (char*)&(client->unifyfs_filelist[fid].filename); /* use client user/group */ @@ -378,7 +378,8 @@ int unifyfs_fid_open( int* outfid, /* allocated local file id if open is successful */ off_t* outpos) /* initial file position if open is successful */ { - int ret; + int rc; + int ret = UNIFYFS_SUCCESS; /* set the pointer to the start of the file */ off_t pos = 0; @@ -414,7 +415,7 @@ int unifyfs_fid_open( /* if O_CREAT, * if not local, allocate fid and storage - * create from local fid meta + * create local fid meta * attempt to create global inode * if EEXIST and O_EXCL, error and release fid/storage * lookup global meta @@ -444,8 +445,8 @@ int unifyfs_fid_open( } /* initialize local storage for this file */ - ret = fid_storage_alloc(client, fid); - if (ret != UNIFYFS_SUCCESS) { + rc = fid_storage_alloc(client, fid); + if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to allocate storage space for file %s (fid=%d)", path, fid); unifyfs_fid_delete(client, fid); @@ -455,42 +456,54 @@ int unifyfs_fid_open( /* TODO: set meta->mode bits to mode variable */ } - /* insert file attribute for file in key-value store */ + /* create global file metadata */ unifyfs_file_attr_op_e op = UNIFYFS_FILE_ATTR_OP_CREATE; ret = unifyfs_set_global_file_meta_from_fid(client, fid, op); - if (ret == EEXIST && !exclusive) { - /* File didn't exist before, but now it does. - * Another process beat us to the punch in creating it. - * Read its metadata to update our cache. */ - ret = unifyfs_get_global_file_meta(client, gfid, &gfattr); - if (ret == UNIFYFS_SUCCESS) { - if (found_local) { - /* TODO: check that global metadata is consistent with - * our existing local entry */ + if (ret == EEXIST) { + LOGINFO("Attempt to create existing file %s (fid:%d)", path, fid); + if (!exclusive) { + /* File didn't exist before, but now it does. + * Another process beat us to the punch in creating it. + * Read its metadata to update our cache. 
*/ + rc = unifyfs_get_global_file_meta(client, gfid, &gfattr); + if (rc == UNIFYFS_SUCCESS) { + /* check for truncate if the file exists already */ + if ((flags & O_TRUNC) && open_for_write) { + if (gfattr.is_laminated) { + ret = EROFS; + } else { + need_truncate = 1; + } + } + + /* append writes are ok on existing files too */ + if ((flags & O_APPEND) && open_for_write) { + ret = UNIFYFS_SUCCESS; + } + + /* Successful in fetching metadata for existing file. + * Update our local cache using that metadata. */ + unifyfs_fid_update_file_meta(client, fid, &gfattr); + + if (!found_local) { + /* it's ok if another client created the shared file */ + ret = UNIFYFS_SUCCESS; + } + } else { + /* Failed to get metadata for a file that should exist. + * Perhaps it was since deleted. We could try to create + * it again and loop through these steps, but for now + * consider this situation to be an error. */ + LOGERR("Failed to get metadata on existing file %s", path); } - - /* Successful in fetching metadata for existing file. - * Update our local cache using that metadata. */ - unifyfs_fid_update_file_meta(client, fid, &gfattr); } else { - /* Failed to get metadata for a file that should exist. - * Perhaps it was since deleted. We could try to create - * it again and loop through these steps, but for now - * consider this situation to be an error. */ - LOGERR("Failed to get metadata on existing file %s (fid:%d)", - path, fid); - } - - /* check for truncate if the file exists already */ - if ((flags & O_TRUNC) && open_for_write && !gfattr.is_laminated) { - need_truncate = 1; + LOGERR("Failed create of existing private/exclusive file %s", + path); } } if (ret != UNIFYFS_SUCCESS) { - LOGERR("Failed to populate the global meta entry for %s (fid:%d)", - path, fid); if (!found_local) { - /* free fid we just allocated above, + /* free fid resources we just allocated above, * but don't do that by calling fid_unlink */ unifyfs_fid_delete(client, fid); } @@ -596,9 +609,8 @@ int unifyfs_fid_open( } } - /* do we normally update position to EOF with O_APPEND? */ + /* for appends, update position to EOF */ if ((flags & O_APPEND) && open_for_write) { - /* We only support O_APPEND on non-laminated files */ pos = unifyfs_fid_logical_size(client, fid); } diff --git a/client/src/unifyfs_fid.h b/client/src/unifyfs_fid.h index 302a4212d..2ea56e987 100644 --- a/client/src/unifyfs_fid.h +++ b/client/src/unifyfs_fid.h @@ -35,7 +35,7 @@ int unifyfs_fid_free(unifyfs_client* client, * Returns the new fid, or negative error value */ int unifyfs_fid_create_file(unifyfs_client* client, const char* path, - int exclusive); + int private); /* Add a new directory and initialize metadata. 
* Returns the new fid, or a negative error value */ diff --git a/t/api/create-open-remove.c b/t/api/create-open-remove.c index 4b2f69544..06ebd4eb8 100644 --- a/t/api/create-open-remove.c +++ b/t/api/create-open-remove.c @@ -32,6 +32,7 @@ int api_create_open_remove_test(char* unifyfs_root, int t2_flags = 0; unifyfs_gfid t1_gfid = UNIFYFS_INVALID_GFID; unifyfs_gfid t2_gfid = UNIFYFS_INVALID_GFID; + unifyfs_gfid dummy_gfid = UNIFYFS_INVALID_GFID; int rc = unifyfs_create(*fshdl, t1_flags, testfile1, &t1_gfid); ok(rc == UNIFYFS_SUCCESS && t1_gfid != UNIFYFS_INVALID_GFID, @@ -43,6 +44,11 @@ int api_create_open_remove_test(char* unifyfs_root, "%s:%d unifyfs_create(%s) is successful: rc=%d (%s)", __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + rc = unifyfs_create(*fshdl, t2_flags, testfile2, &dummy_gfid); + ok(rc != UNIFYFS_SUCCESS && dummy_gfid == UNIFYFS_INVALID_GFID, + "%s:%d unifyfs_create(%s) for existing file fails: rc=%d (%s)", + __FILE__, __LINE__, testfile2, rc, unifyfs_rc_enum_description(rc)); + diag("Finished API create tests"); //------------- From 118772afd1d5b8ef1909a071bcb3a31ed0c15a28 Mon Sep 17 00:00:00 2001 From: CamStan Date: Mon, 27 Jun 2022 18:59:58 -0700 Subject: [PATCH 73/81] Docs: New limitations and verifyio docs plus edits Adjust the table of contents layout by moving documentation containing additional information to the middle Reference section in order to keep the User Guide less cluttered. Moved the Contributing section to the bottom. New documentation: - Limitation and Workarounds of UnifyFS - How to Use VerifyIO to Determine Compatibility with UnifyFS Additional Edits: - Add a caveat about skipping the header and mount/unmount APIs when using the auto-mount feature - Link to VerifyIO under Semantics section - Add mochi-margo warning for Thallium/Mochi Suite/SDS Repo users - Place Developer Documentation under new top-level header to it appears on the table of contents - Add reference section on UnifyFS Error Codes - Minor wording and formatting edits --- docs/api.rst | 26 +++- docs/assumptions.rst | 40 +++-- docs/build.rst | 66 ++++---- docs/contribute-ways.rst | 22 ++- docs/dependencies.rst | 34 +++- docs/examples.rst | 10 +- docs/index.rst | 18 +-- docs/limitations.rst | 186 ++++++++++++++++++++++ docs/verifyio.rst | 327 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 648 insertions(+), 81 deletions(-) create mode 100644 docs/limitations.rst create mode 100644 docs/verifyio.rst diff --git a/docs/api.rst b/docs/api.rst index 51778ed39..79d593f63 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -6,7 +6,7 @@ This section describes how to use the UnifyFS API in an application. .. Attention:: **Fortran Compatibility** - ``unifyfs_mount`` and ``unifyfs_unmount`` are now usable with GFortran. + ``unifyfs_mount`` and ``unifyfs_unmount`` are usable with GFortran. There is a known ifort_issue_ with the Intel Fortran compiler as well as an xlf_issue_ with the IBM Fortran compiler. Other Fortran compilers are currently unknown. @@ -15,9 +15,19 @@ This section describes how to use the UnifyFS API in an application. include the ``+fortran`` variant, or configure UnifyFS with the ``--enable-fortran`` option if building manually. ---------------------------- -Include the UnifyFS header ---------------------------- +.. 
rubric:: Transparent Mount Caveat + +MPI applications that take advantage of the :ref:`transparent mounting +` feature (through configuring with ``--enable-mpi-mount`` or +with ``+auto-mount`` through Spack) do not need to be modified in any way in +order to use UnifyFS. Move on to the :doc:`link` section next as this step can +be skipped. + +----- + +-------------------------- +Include the UnifyFS Header +-------------------------- In C or C++ applications, include ``unifyfs.h``. See writeread.c_ for a full example. @@ -35,9 +45,9 @@ full example. include 'unifyfsf.h' ---------------------------- +-------- Mounting ---------------------------- +-------- UnifyFS implements a file system in user space, which the system has no knowledge about. The UnifyFS library intecepts and handles I/O calls whose path matches a prefix that is defined by the user. @@ -65,9 +75,9 @@ Here, ``/unifyfs`` is the path prefix for UnifyFS to intercept. The ``rank`` parameter specifies the MPI rank of the calling process. The ``size`` parameter specifies the number of MPI ranks in the user job. ---------------------------- +---------- Unmounting ---------------------------- +---------- When the application is done using UnifyFS, it should call ``unifyfs_unmount``. diff --git a/docs/assumptions.rst b/docs/assumptions.rst index 10eea9f76..36772c945 100644 --- a/docs/assumptions.rst +++ b/docs/assumptions.rst @@ -1,13 +1,13 @@ -================ +========================= Assumptions and Semantics -================ +========================= In this section, we provide assumptions we make about the behavior of applications that use UnifyFS and about the file system semantics of UnifyFS. ---------------------------- +------------------- System Requirements ---------------------------- +------------------- The system requirements to run UnifyFS are: @@ -17,9 +17,11 @@ The system requirements to run UnifyFS are: - The system must support the ability for UnifyFS user-level server processes to run concurrently with user application processes on compute nodes. ---------------------------- +---------- + +-------------------- Application Behavior ---------------------------- +-------------------- UnifyFS is specifically designed to support the bulk synchronous I/O pattern that is typical in HPC applications, e.g., checkpoint/restart or output dumps. @@ -40,12 +42,13 @@ the performance might be slower and the user may have to take additional steps to ensure correct execution of the application with UnifyFS. For more information on this topic, refer to the section on -:ref:`commit consistency semantics in UnifyFS `_. +:ref:`commit consistency semantics in UnifyFS `. +---------- ---------------------------- +----------------- Consistency Model ---------------------------- +----------------- The UnifyFS file system does not support strict POSIX consistency semantics. (Please see @@ -58,10 +61,10 @@ modified by the application. These two consistency models provide opportunities for UnifyFS to provide better performance for the I/O operations of HPC applications. -''''''''''''''''''''''''''' -Commit Consistency Semantics in UnifyFS -''''''''''''''''''''''''''' .. 
_commit_consistency_label: +''''''''''''''''''''''''''''''''''''''' +Commit Consistency Semantics in UnifyFS +''''''''''''''''''''''''''''''''''''''' Commit consistency semantics require explicit "commit" operations to be performed before updates to a file @@ -153,11 +156,12 @@ and multiple processes write concurrently to the same file offset or to an overlapping region, the result is undefined and may reflect the result of a mixture of the processes' operations to that offset or region. -.. How can users check that their application is correctly synchronized? Will we have the checker scripts ready? +The :doc:`VerifyIO ` tool can be used to determine whether an +application is correctly synchronized. -''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''''' Lamination Consistency Semantics in UnifyFS -''''''''''''''''''''''''''' +''''''''''''''''''''''''''''''''''''''''''' The other consistency model that UnifyFS employs is called "lamination semantics" which is intended to be applied once a file is done being modified @@ -202,9 +206,11 @@ metadata and file data between compute nodes on every write. Also, since file contents cannot change after lamination, aggressive caching may be used during the read phase with minimal locking. ---------------------------- +---------- + +-------------------- File System Behavior ---------------------------- +-------------------- The following summarize the behavior of UnifyFS under our consistency model. diff --git a/docs/build.rst b/docs/build.rst index 961e4d1b4..aac26a5a4 100644 --- a/docs/build.rst +++ b/docs/build.rst @@ -1,26 +1,34 @@ -======================== +============= Build UnifyFS -======================== +============= This section describes how to build UnifyFS and its dependencies. There are three options: -* build both UnifyFS and its dependencies with Spack, +* build both UnifyFS and dependencies with Spack, * build the dependencies with Spack, but build UnifyFS with autotools * build the dependencies with a bootstrap script, and build UnifyFS with autotools ---------------------------- +---------- ---------------------------------------------- -Build UnifyFS and its dependencies with Spack ---------------------------------------------- +----------------------------------------- +Build UnifyFS and Dependencies with Spack +----------------------------------------- One may install UnifyFS and its dependencies with Spack_. If you already have Spack, make sure you have the latest release. If you use a clone of the Spack develop branch, be sure to pull the latest changes. -.. _build-label: +.. warning:: **Thallium, Mochi Suite, and SDS Repo Users** + + The available and UnifyFS-compatible Mochi-Margo versions that are in the + ``mochi-margo`` Spack package do not match up with the latest/default + versions in the Mochi Suite, SDS Repo, and ``mochi-thallium`` Spack + packages. It is likely that a different version of ``mochi-margo`` will need + to be specified in the install command of UnifyFS (E.g.: ``spack install + unifyfs ^mochi-margo@0.9.6``). +.. _build-label: Install Spack ************* @@ -33,8 +41,8 @@ Install Spack Use `Spack's shell support`_ to add Spack to your ``PATH`` and enable use of the ``spack`` command. -Build UnifyFS and its dependencies -********************************** +Build and Install UnifyFS +************************* .. 
code-block:: Bash @@ -44,11 +52,9 @@ Build UnifyFS and its dependencies If the most recent changes on the development branch ('dev') of UnifyFS are desired, then do ``spack install unifyfs@develop``. -.. Edit the following admonition if the default of variants are changed or when - new variants are added. - Include or remove variants with Spack when installing UnifyFS when a custom -build is desired. Run ``spack info unifyfs`` for more info. +build is desired. Run ``spack info unifyfs`` for more information on available +variants. .. table:: UnifyFS Build Variants :widths: auto @@ -58,9 +64,6 @@ build is desired. Run ``spack info unifyfs`` for more info. (``spack install ``) ========== ============================= ======= =========================== Auto-mount ``unifyfs+auto-mount`` True Enable transparent mounting - HDF5 ``unifyfs+hdf5`` False Build with parallel HDF5 - - ``unifyfs+hdf5 ^hdf5~mpi`` False Build with serial HDF5 Fortran ``unifyfs+fortran`` False Enable Fortran support PMI ``unifyfs+pmi`` False Enable PMI2 support PMIx ``unifyfs+pmix`` False Enable PMIx support @@ -78,10 +81,10 @@ build is desired. Run ``spack info unifyfs`` for more info. Run ``spack spec -I unifyfs`` before installing to see what Spack is going to do. ---------------------------- +---------- ----------------------------------------------------------- -Build dependencies with Spack, build UnifyFS with autotools +Build Dependencies with Spack, Build UnifyFS with Autotools ----------------------------------------------------------- One can install the UnifyFS dependencies with Spack and build UnifyFS @@ -92,7 +95,7 @@ Take advantage of `Spack Environments`_ to streamline this process. .. _spack-build-label: -Build the dependencies +Build the Dependencies ********************** Once Spack is installed on your system (see :ref:`above `), @@ -101,13 +104,13 @@ the UnifyFS dependencies can then be installed. .. code-block:: Bash $ spack install gotcha - $ spack install mochi-margo ^libfabric fabrics=rxm,sockets,tcp + $ spack install mochi-margo@0.9.6 ^libfabric fabrics=rxm,sockets,tcp $ spack install spath~mpi .. tip:: - You can run ``spack install --only=dependencies unifyfs`` to install all - UnifyFS dependencies without installing UnifyFS. + Run ``spack install --only=dependencies unifyfs`` to install all UnifyFS + dependencies without installing UnifyFS itself. Keep in mind this will also install all the build dependencies and dependencies of dependencies if you haven't already installed them through @@ -118,7 +121,7 @@ Build UnifyFS ************* Download the latest UnifyFS release from the Releases_ page or clone the develop -branch from the UnifyFS repository +branch ('dev') from the UnifyFS repository `https://github.com/LLNL/UnifyFS `_. Load the dependencies into your environment and then @@ -149,21 +152,21 @@ Alternatively, UnifyFS can be configured using ``CPPFLAGS`` and ``LDFLAGS``: To see all available build configuration options, run ``./configure --help`` after ``./autogen.sh`` has been run. ---------------------------- +---------- ------------------------------------------------------------------ -Build dependencies with bootstrap and build UnifyFS with autotools +Build Dependencies with Bootstrap and Build UnifyFS with Autotools ------------------------------------------------------------------ Download the latest UnifyFS release from the Releases_ page or clone the develop -branch from the UnifyFS repository +branch ('dev') from the UnifyFS repository `https://github.com/LLNL/UnifyFS `_. 
Build the Dependencies ********************** UnifyFS requires MPI, GOTCHA, Margo and OpenSSL. -References to these dependencies can be found on our :doc:`dependencies` page. +References to these dependencies can be found on the :doc:`dependencies` page. A bootstrap.sh_ script in the UnifyFS source distribution downloads and installs all dependencies. Simply run the script in the top level directory of the @@ -212,14 +215,14 @@ after ``./autogen.sh`` has been run. On Cray systems, the detection of MPI compiler wrappers requires passing the following flags to the configure command: ``MPICC=cc MPIFC=ftn`` ---------------------------- +---------- ----------------- Configure Options ----------------- -When building UnifyFS with autotools, -a number of options are available to configure its functionality. +When building UnifyFS with autotools, a number of options are available to +configure its functionality. Fortran ******* @@ -272,6 +275,7 @@ relative paths within an application. To enable, use the ``--with-spath`` configure option or provide the appropriate ``CPPFLAGS`` and ``LDFLAGS`` at configure time. +.. _auto-mount-label: Transparent Mounting for MPI Applications ***************************************** diff --git a/docs/contribute-ways.rst b/docs/contribute-ways.rst index d0a22324f..b774db296 100644 --- a/docs/contribute-ways.rst +++ b/docs/contribute-ways.rst @@ -1,5 +1,5 @@ ****************** -Ways to Contribute +Contributing Guide ****************** *First of all, thank you for taking the time to contribute!* @@ -102,6 +102,18 @@ able to quickly identify and resolve issues. Documentation ============= +As UnifyFS is continually improved and updated, it is easy for documentation to +become out-of-date. Any contributions to the documentation, no matter how +small, is always greatly appreciated. If you are not in a position to update +the documentation yourself, please notify us via the `mailing list`_ of +anything you notice that is missing or needs to be changed. + +--------------- + +*********************** +Developer Documentation +*********************** + Here is our current documentation of how the internals of UnifyFS function for several basic operations. @@ -113,16 +125,10 @@ several basic operations. :align: left :alt: UnifyFS Developer's Documentation -:download:`Download slides `. +:download:`Download PDF `. | -As UnifyFS is continually improved and updated, it is easy for documentation to -become out-of-date. Any contributions to the documentation, no matter how -small, is always greatly appreciated. If you are not in a position to update -the documentation yourself, please notify us via the `mailing list`_ of -anything you notice that needs to be changed. - .. explicit external hyperlink targets .. 
_mailing list: ecp-unifyfs@exascaleproject.org diff --git a/docs/dependencies.rst b/docs/dependencies.rst index 9cd0ae400..2422c145a 100644 --- a/docs/dependencies.rst +++ b/docs/dependencies.rst @@ -10,12 +10,12 @@ Required - `GOTCHA `_ version 1.0.3 (or later) -- `Margo `_ version 0.9.6 (or later) and its dependencies: +- `Margo `_ version 0.9.6 - version 0.9.9 and its dependencies: - `Argobots `_ version 1.1 (or later) - `Mercury `_ version 2.0.1 (or later) - - `libfabric `_ or `bmi `_ + - `libfabric `_ (avoid versions 1.13 and 1.13.1) or `bmi `_ - `JSON-C `_ @@ -33,3 +33,33 @@ Optional -------- - `spath `_ for normalizing relative paths + +---------- + +=================== +UnifyFS Error Codes +=================== + +Wherever sensible, UnifyFS uses the error codes defined in POSIX `errno.h +`_. + +UnifyFS specific error codes are defined as follows: + +.. table:: + :widths: auto + + ===== ========= ====================================== + Value Error Description + ===== ========= ====================================== + 1001 BADCONFIG Configuration has invalid setting + 1002 GOTCHA Gotcha operation error + 1003 KEYVAL Key-value store operation error + 1004 MARGO Mercury/Argobots operation error + 1005 MDHIM MDHIM operation error + 1006 META Metadata store operation error + 1007 NYI Not yet implemented + 1008 PMI PMI2/PMIx error + 1009 SHMEM Shared memory region init/access error + 1010 THREAD POSIX thread operation failed + 1011 TIMEOUT Timed out + ===== ========= ====================================== diff --git a/docs/examples.rst b/docs/examples.rst index 74a918b33..7cf2631b6 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -1,14 +1,14 @@ -******** -Examples -******** +**************** +Example Programs +**************** There are several examples_ available on ways to use UnifyFS. These examples build into static, GOTCHA, and pure POSIX (not linked with UnifyFS) versions depending on how they are linked. Several of the example programs are also used in the UnifyFS :doc:`intregraton testing `. -Examples Locations -================== +Locations of Examples +===================== The example programs can be found in two locations, where UnifyFS is built and where UnifyFS is installed. diff --git a/docs/index.rst b/docs/index.rst index cf4cc4979..b640310be 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -14,13 +14,21 @@ UnifyFS: A file system for burst buffers overview definitions assumptions + limitations build api link configuration run + +.. toctree:: + :maxdepth: 2 + :caption: Reference + examples library_api + dependencies + verifyio .. toctree:: :maxdepth: 2 @@ -32,16 +40,6 @@ UnifyFS: A file system for burst buffers wrappers add-rpcs -.. toctree:: - :maxdepth: 2 - :caption: Dependencies - -.. toctree:: - :maxdepth: 2 - :caption: Reference - - dependencies - ================== Indices and tables ================== diff --git a/docs/limitations.rst b/docs/limitations.rst new file mode 100644 index 000000000..dfde0c5c3 --- /dev/null +++ b/docs/limitations.rst @@ -0,0 +1,186 @@ +=========================== +Limitations and Workarounds +=========================== + +------------------- +General Limitations +------------------- + +.. rubric:: Synchronization Across Processes + +Any overlapping write operations or simultaneous read and write operations +require proper synchronization within UnifyFS. This includes ensuring updates to +a file are visible by other processes as well as proper inter-process +communication to enforce ordering of conflicting I/O operations. 
Refer to the
+section on
+:ref:`commit consistency semantics in UnifyFS `
+for more detail.
+
+.. rubric:: File Locking
+
+UnifyFS does not support file locking (calls to ``fcntl()``). This results in
+some less obvious limitations when using some I/O libraries with UnifyFS
+(:ref:`see below `).
+
+.. warning::
+
+ Any calls to ``fcntl()`` are not even intercepted by UnifyFS; such calls
+ will be ignored, resulting in silent errors and possible data corruption.
+ Running the application through :doc:`VerifyIO ` can
+ help determine if any file locking calls are made.
+
+----------
+
+---------------------------
+MPI-IO and HDF5 Limitations
+---------------------------
+
+Synchronization
+***************
+
+Applications that make use of inter-process communication (e.g., MPI) to enforce
+a particular order of potentially conflicting I/O operations from multiple
+processes must properly use synchronization calls (e.g., ``MPI_File_sync()``) to
+avoid possible data corruption. Within UnifyFS, this requires use of the
+:ref:`"sync-barrier-sync" construct ` to ensure proper
+synchronization.
+
+Properly synchronizing includes doing so between any less obvious MPI-I/O calls
+that also imply a write/read. For example, ``MPI_File_set_size()`` and
+``MPI_File_preallocate()`` act as write operations and ``MPI_File_get_size()``
+acts as a read operation. There may be other implied write/read calls as well.
+
+.. TODO: Mention use/need of ``romio_visibility_immediate`` hint once available.
+.. https://github.com/pmodels/mpich/issues/5902
+
+Synchronization Workarounds
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your application does not adhere to proper synchronization requirements, there
+are four workaround options available to still allow UnifyFS integration.
+
+UnifyFS Sync-per-write Configuration (``client.write_sync``)
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+UnifyFS provided config option that makes UnifyFS act more “POSIX like” by
+forcing a metadata sync to the server after **every** write operation. Set
+``UNIFYFS_CLIENT_WRITE_SYNC=ON`` to enable this option.
+
+**Cost:** Can cause a significant decrease in write performance as the number of
+file sync operations that are performed will be far more than necessary.
+
+Manually Add Sync Operations
+""""""""""""""""""""""""""""
+
+Edit the application code and manually add the proper synchronization calls
+everywhere necessary. For example, wherever an ``MPI_File_sync()`` is required,
+the "sync-barrier-sync" construct needs to be used.
+
+.. _sync-barrier-sync-label:
+
+.. code-block:: C
+ :caption: Sync-barrier-sync Construct
+
+ MPI_File_sync() //flush newly written bytes from MPI library to file system
+ MPI_Barrier() //ensure all ranks have finished the previous sync
+ MPI_File_sync() //invalidate read cache in MPI library
+
+.. Note::
+
+ The "barrier" in "sync-barrier-sync" can be replaced by a send-recv or
+ certain collectives that are guaranteed to be synchronized. See the "Note on
+ the third step" in the `VerifyIO README`_ for more information.
+
+**Cost:** If making edits to the application source code is an option, the
+time and effort required to track down all the places where proper
+synchronization calls are needed can be substantial.
+:doc:`VerifyIO ` can help in this effort.
+
+HDF5 FILE_SYNC
+""""""""""""""
+
+HDF5 provided config option that forces HDF5 to add an ``MPI_File_sync()`` call
+after every collective write operation when needed by the underlying MPI-IO
+driver. Set ``HDF5_DO_MPI_FILE_SYNC=1`` to enable this option.
+
+.. Note::
+
+ This option will soon be available in the `HDF5 develop branch`_ as well as
+ in the next HDF5 release.
+
+**Cost:** Can cause a significant decrease in write performance as the number of
+file sync operations performed will likely be more than necessary. Similar to,
+but potentially more efficient than, the ``WRITE_SYNC`` workaround as fewer
+overall file syncs may be performed in comparison, but still likely more than
+needed.
+
+ROMIO Driver Hint
+"""""""""""""""""
+
+A ROMIO provided hint that will cause the ROMIO driver (in a supported MPI
+library) to add an ``MPI_Barrier()`` call and an additional ``MPI_File_sync()``
+call after each already existing ``MPI_File_sync()`` call within the
+application. In other words, this hint converts each existing
+``MPI_File_sync()`` call into the "sync-barrier-sync" construct. Enable the
+``romio_synchronizing_flush`` hint to use this workaround.
+
+**Cost:** Potentially more efficient than the ``WRITE_SYNC`` and HDF5
+``FILE_SYNC`` workarounds as this will cause the application to use the
+synchronization construct required by UnifyFS everywhere the application already
+intends them to occur (i.e., whenever there is already an ``MPI_File_sync()``).
+However, if (1) any existing ``MPI_File_sync()`` calls are only meant to make
+data visible to the other processes (rather than to avoid potential conflicts)
+or (2) the application contains a mix of lone ``MPI_File_sync()`` calls along
+with the "sync-barrier-sync" construct, then this approach will result in more
+syncs than necessary.
+
+----------
+
+.. _file-lock-label:
+File Locking
+************
+
+Because UnifyFS does not support file locks, some I/O library features do not
+work with UnifyFS.
+
+.. topic:: Atomicity
+
+ ROMIO uses ``fcntl()`` to implement atomicity. It is recommended to disable
+ atomicity when integrating with UnifyFS. To disable, run
+ ``MPI_File_set_atomicity(fh, 0)``.
+
+.. topic:: Data Sieving
+
+ It is recommended to disable data sieving when integrating with UnifyFS.
+ Even with locking support, use of data sieving will drastically increase the
+ time and space overhead within UnifyFS, significantly decreasing application
+ performance. For ROMIO, set the hints ``romio_ds_write disable`` and
+ ``romio_ds_read disable`` to disable data sieving.
+
+.. topic:: Shared File Pointers
+
+ Avoid using shared file pointers in MPI-I/O under UnifyFS as they require
+ file locking to implement.
+ Functions that use shared file pointers include:
+
+ - ``MPI_File_write_shared()``
+ - ``MPI_File_read_shared()``
+ - ``MPI_File_write_ordered()``
+ - ``MPI_File_read_ordered()``
+
+File Locking Workarounds
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+UnifyFS doesn't provide any direct workarounds for anything that requires file
+locking. Simply disable atomicity and data sieving and avoid using shared file
+pointers to get around this.
+
+In the future, UnifyFS may add support for file locking. However, it is strongly
+suggested to avoid file locking unless the application cannot run properly
+without its use. Enabling file lock support within UnifyFS will result in
+decreased I/O performance for the application.
+
+.. explicit external hyperlink targets
+
+.. _HDF5 develop branch: https://github.com/HDFGroup/hdf5
+..
_VerifyIO README: https://github.com/uiuc-hpc/Recorder/tree/pilgrim/tools/verifyio#note-on-the-third-step diff --git a/docs/verifyio.rst b/docs/verifyio.rst new file mode 100644 index 000000000..b1466ba21 --- /dev/null +++ b/docs/verifyio.rst @@ -0,0 +1,327 @@ +========================================= +VerifyIO: Determine UnifyFS Compatibility +========================================= + +---------------------- +Recorder and VerifyIO +---------------------- + +VerifyIO_ can be used to determine an application's compatibility with UnifyFS +as well as aid in narrowing down what an application may need to change to +become compatible with UnifyFS. + +VerifyIO is a tool within the Recorder_ tracing framework that takes the +application traces from Recorder and determines whether I/O synchronization is +correct based on the underlying file system semantics (e.g., POSIX, commit) and +synchronization semantics (e.g., POSIX, MPI). + +Run VerifyIO with commit semantics on the application's traces to determine +compatibility with UnifyFS. + +---------- + +-------------- +VerifyIO Guide +-------------- + +To use VerifyIO, the Recorder library needs to be installed. See the `Recorder +README`_ for full instructions on how to build, run, and use Recorder. + +Build +***** + +Clone the ``pilgrim`` (default) branch of Recorder: + +.. code-block:: Bash + :caption: Clone + + $ git clone https://github.com/uiuc-hpc/Recorder.git + +Determine the install locations of the MPI-IO and HDF5 libraries being used by +the application and pass those paths to Recorder at configure time. + +.. code-block:: Bash + :caption: Configure, Make, and Install + + $ deps_prefix="${mpi_install};${hdf5_install}" + $ mkdir -p build install + + $ cd build + $ cmake -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_PREFIX_PATH=$deps_prefix ../Recorder + $ make + $ make install + + # Capture Recorder source code and install locations + $ export RECORDER_SRC=/path/to/Recorder/source/code + $ export RECORDER_ROOT=/path/to/Recorder/install + +Python3 and the ``recorder-viz`` and ``networkx`` packages are also required to +run the final VerifyIO verification code. + +.. code-block:: Bash + :caption: Install Python Packages + + $ module load python/3.x.x + $ + $ pip3 install recorder-viz --user + $ pip3 install networkx --user + +Run +*** + +Before capturing application traces, it is recommended to disable data sieving +as VerifyIO will flag this as incompatible under commit semantics. + +.. code-block:: Bash + :caption: Disable Data Sieving + + echo -e "romio_ds_write disable\nromio_ds_read disable" > /path/to/romio_hints + export ROMIO_HINTS=/path/to/romio_hints + export ROMIO_PRINT_HINTS=1 #optional + + +Run the application with Recorder to capture the traces using the appropriate +environment variable export option for the available workload manager. + +.. code-block:: Bash + :caption: Capture Traces + + srun -N $nnodes -n $nprocs --export=ALL,LD_PRELOAD=$RECORDER_ROOT/lib/librecorder.so example_app_executable + +Recorder places the trace files in a folder within the current working directory +named ``hostname-username-appname-pid-starttime``. + +.. _recorder2text-label: +If desired (e.g., for debugging), use the recorder2text tool to generate +human-readable traces from the captured trace files. + +.. code-block:: Bash + :caption: Generate Human-readable Traces + + $RECORDER_ROOT/bin/recorder2text /path/to/traces &> recorder2text.out + +This will generate text-format traces in the folder ``path/to/traces/_text``. 
+ +Next, run the Recorder conflict detector to capture **potential** conflicts. The +``--semantics=`` option needs to match the semantics of the intended underlying +file system. In the case of UnifyFS, use ``commit`` semantics. + +.. code-block:: Bash + :caption: Capture Potential Conflicts + + $RECORDER_ROOT/bin/conflict_detector /path/to/traces --semantics=commit &> conflict_detector_commit.out + +The potential conflicts will be recorded to the file +``path/to/traces/conflicts.txt``. + +Lastly, run VerifyIO with the traces and potential conflicts to determine +whether all I/O operations are properly synchronized under the desired standard +(e.g., POSIX, MPI). + +.. code-block:: Bash + :caption: Run VerifyIO + + # Evaluate using POSIX standard + python3 $RECORDER_SRC/tools/verifyio/verifyio.py /path/to/traces /path/to/traces/conflicts.txt --semantics=posix &> verifyio_commit_results.posix + + # Evaluate using MPI standard + python3 $RECORDER_SRC/tools/verifyio/verifyio.py /path/to/traces /path/to/traces/conflicts.txt --semantics=mpi &> verifyio_commit_results.mpi + +Interpreting Results +******************** + +In the event VerifyIO shows an incompatibility, or the results are not clear, +don't hesitate to contact the UnifyFS team `mailing list`_ for aid in +determining a solution. + +Conflict Detector Results +^^^^^^^^^^^^^^^^^^^^^^^^^ + +When there are no potential conflicts, the conflict detector output simply +states as much: + +.. code-block:: none + + [prompt]$ cat conflict_detector_commit.out + Check potential conflicts under Commit Semantics + ... + No potential conflict found for file /path/to/example_app_outfile + +When potential conflicts exist, the conflict detector prints a list of each +conflicting pair. For each operation within a pair, the output contains the +process rank, sequence ID, offset the conflict occurred at, number of bytes +affected by the operation, and whether the operation was a write or a read. +This format is printed at the top of the output. + +.. code-block:: none + + [prompt]$ cat conflict_detector_commit.out + Check potential conflicts under Commit Semantics + Format: + Filename, io op1(rank-seqId, offset, bytes, isRead), io op2(rank-seqId, offset, bytes, isRead) + + /path/to/example_app_outfile, op1(0-244, 0, 800, write), op2(0-255, 0, 96, write) + /path/to/example_app_outfile, op1(0-92, 4288, 2240, write), op2(0-148, 4288, 2216, read) + /path/to/example_app_outfile, op1(1-80, 6528, 2240, write), op2(1-136, 6528, 2216, read) + ... + /path/to/example_app_outfile, op1(0-169, 18480, 4888, write), op2(3-245, 18848, 14792, read) + /path/to/example_app_outfile, op1(0-169, 18480, 4888, write), op2(3-246, 18848, 14792, write) + /path/to/example_app_outfile, op1(0-231, 18480, 16816, write), op2(3-245, 18848, 14792, read) + /path/to/example_app_outfile, Read-after-write (RAW): D-2,S-5, Write-after-write (WAW): D-1,S-2 + +The final line printed contains a summary of all the potential conflicts. +This consists of the total number of read-after-write (RAW) and +write-after-write (WAW) potentially conflicting operations performed by +different processes (D-#) or the same process (S-#). + +VerifyIO Results +^^^^^^^^^^^^^^^^ + +VerifyIO takes the traces and potential conflicts and checks if each conflicting pair is properly synchronized. Refer to the `VerifyIO README `_ for a +description on what determines proper synchronization for a conflicting I/O +pair. 
+ +Compatible with UnifyFS +""""""""""""""""""""""" + +In the event that there are no potential conflicts, or each potential conflict +pair was performed by the same rank, VerifyIO will report the application as +being properly synchronized and therefore compatible with UnifyFS. + +.. code-block:: none + + [prompt]$ cat verifyio_commit_results.posix + Rank: 0, intercepted calls: 79, accessed files: 5 + Rank: 1, intercepted calls: 56, accessed files: 2 + Building happens-before graph + Nodes: 46, Edges: 84 + + Properly synchronized under posix semantics + + + [prompt]$ cat verifyio_commit_results.mpi + Rank: 0, intercepted calls: 79, accessed files: 5 + Rank: 1, intercepted calls: 56, accessed files: 2 + Building happens-before graph + Nodes: 46, Edges: 56 + + Properly synchronized under mpi semantics + +When there are potential conflicts from different ranks but the proper +synchronization has occurred, VerifyIO will also report the application as being +properly synchronized. + +.. code-block:: none + + [prompt]$ cat verifyio_commit_results.posix + Rank: 0, intercepted calls: 510, accessed files: 8 + Rank: 1, intercepted calls: 482, accessed files: 5 + Rank: 2, intercepted calls: 481, accessed files: 5 + Rank: 3, intercepted calls: 506, accessed files: 5 + Building happens-before graph + Nodes: 299, Edges: 685 + Conflicting I/O operations: 0-169-write <--> 3-245-read, properly synchronized: True + Conflicting I/O operations: 0-169-write <--> 3-246-write, properly synchronized: True + + Properly synchronized under posix semantics + +Incompatible with UnifyFS +""""""""""""""""""""""""" + +In the event there are potential conflicts from different ranks but the proper +synchronization has **not** occurred, VerifyIO will report the application as +not being properly synchronized and therefore incompatible [*]_ with UnifyFS. + +Each operation involved in the conflicting pair is listed in the format +``rank-sequenceID-operation`` followed by the whether that pair is properly +synchronized. + +.. code-block:: none + + [prompt]$ cat verifyio_commit_results.mpi + Rank: 0, intercepted calls: 510, accessed files: 8 + Rank: 1, intercepted calls: 482, accessed files: 5 + Rank: 2, intercepted calls: 481, accessed files: 5 + Rank: 3, intercepted calls: 506, accessed files: 5 + Building happens-before graph + Nodes: 299, Edges: 427 + 0-169-write --> 3-245-read, properly synchronized: False + 0-169-write --> 3-246-write, properly synchronized: False + + Not properly synchronized under mpi semantics + +.. [*] Incompatible here does not mean the application cannot work with UnifyFS + at all, just under the default configuration. There are + :doc:`workarounds ` available that could very easily change this + result (VerifyIO plans to have options to run under the assumption some + workarounds are in place). Should your outcome be an incompatible result, + please contact the UnifyFS `mailing list`_ for aid in finding a solution. + +.. rubric:: Debugging a Conflict + +The :ref:`recorder2text output ` can be used to aid in +narrowing down where/what is causing a conflicting pair. In the incompatible +example above, the first pair is a ``write()`` from rank 0 with the sequence ID +of 169 and a ``read()`` from rank 3 with the sequence ID of 245. + +The sequence IDs correspond to the order in which functions were called by that +particular rank. In the recorder2text output, this ID will then correspond to +line numbers, but off by +1 (i.e., seqID 169 -> line# 170). + +.. 
code-block:: none + :caption: recorder2text output + :emphasize-lines: 6,14 + + #rank 0 + ... + 167 0.1440291 0.1441011 MPI_File_write_at_all 1 1 ( 0-0 0 %p 1 MPI_TYPE_UNKNOWN [0_0] ) + 168 0.1440560 0.1440679 fcntl 2 0 ( /path/to/example_app_outfile 7 1 ) + 169 0.1440700 0.1440750 pread 2 0 ( /path/to/example_app_outfile %p 4888 18480 ) + 170 0.1440778 0.1440909 pwrite 2 0 ( /path/to/example_app_outfile %p 4888 18480 ) + 171 0.1440918 0.1440987 fcntl 2 0 ( /path/to/example_app_outfile 6 2 ) + ... + + #rank 3 + ... + 244 0.1539204 0.1627174 MPI_File_write_at_all 1 1 ( 0-0 0 %p 1 MPI_TYPE_UNKNOWN [0_0] ) + 245 0.1539554 0.1549513 fcntl 2 0 ( /path/to/example_app_outfile 7 1 ) + 246 0.1549534 0.1609544 pread 2 0 ( /path/to/example_app_outfile %p 14792 18848 ) + 247 0.1609572 0.1627053 pwrite 2 0 (/path/to/example_app_outfile %p 14792 18848 ) + 248 0.1627081 0.1627152 fcntl 2 0 ( /path/to/example_app_outfile 6 2 ) + ... + +Note that in this example the ``pread()``/``pwrite()`` calls from rank 3 operate +on overlapping bytes from the ``pwrite()`` call from rank 0. For this example, +data sieving was left enabled which results in "fcntl-pread-pwrite-fcntl" I/O +sequences. Refer to :doc:`limitations` for more on the file locking limitations +of UnifyFS. + +The format of the recorder2text output is: `` + (func-parameters)`` + +.. Note:: + + The ```` value indicates whether the function was called + directly by the application or by an I/O library. The ```` value + shows the Recorder-tracked function type. + + +-------+------------------------------------+-+-------+-----------------+ + | Value | Call Level | | Value | Function Type | + +=======+====================================+=+=======+=================+ + | 0 | Called by application directly | | 0 | RECORDER_POSIX | + +-------+------------------------------------+ +-------+-----------------+ + | 1 | - Called by HDF5 | | 1 | RECORDER_MPIIO | + | | - Called by MPI (no HDF5) | +-------+-----------------+ + | | | | 2 | RECORDER_MPI | + +-------+------------------------------------+ +-------+-----------------+ + | 2 | Called by MPI, which was called by | | 3 | RECORDER_HDF5 | + | | HDF5 | +-------+-----------------+ + | | | | 4 | RECORDER_FTRACE | + +-------+------------------------------------+-+-------+-----------------+ + +.. explicit external hyperlink targets + +.. _mailing list: ecp-unifyfs@exascaleproject.org +.. _Recorder: https://github.com/uiuc-hpc/Recorder +.. _Recorder README: https://github.com/uiuc-hpc/Recorder/blob/pilgrim/README.md +.. _VerifyIO: https://github.com/uiuc-hpc/Recorder/tree/pilgrim/tools/verifyio#note-on-the-third-step From 121d8715852168f73bf285b4dcb8fe3b638b925a Mon Sep 17 00:00:00 2001 From: "Michael J. 
Brim" Date: Mon, 27 Jun 2022 09:58:54 -0400 Subject: [PATCH 74/81] avoid request signal if RM thread exiting --- server/src/unifyfs_request_manager.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/server/src/unifyfs_request_manager.c b/server/src/unifyfs_request_manager.c index 41185f399..36331d603 100644 --- a/server/src/unifyfs_request_manager.c +++ b/server/src/unifyfs_request_manager.c @@ -159,9 +159,9 @@ reqmgr_thrd_t* unifyfs_rm_thrd_create(int app_id, int client_id) thrd_ctrl->client_id = client_id; /* initialize flow control flags */ - thrd_ctrl->exit_flag = 0; - thrd_ctrl->exited = 0; - thrd_ctrl->waiting_for_work = 0; + thrd_ctrl->exit_flag = 0; + thrd_ctrl->exited = 0; + thrd_ctrl->waiting_for_work = 0; /* launch request manager thread */ rc = pthread_create(&(thrd_ctrl->thrd), NULL, @@ -262,7 +262,7 @@ int rm_release_read_req(reqmgr_thrd_t* thrd_ctrl, static void signal_new_requests(reqmgr_thrd_t* reqmgr) { pid_t this_thread = unifyfs_gettid(); - if (this_thread != reqmgr->tid) { + if ((!reqmgr->exit_flag) && (this_thread != reqmgr->tid)) { /* signal reqmgr to begin processing the requests we just added */ LOGDBG("signaling new requests"); pthread_cond_signal(&reqmgr->thrd_cond); @@ -1637,11 +1637,11 @@ void* request_manager_thread(void* arg) rc = rm_heartbeat(thrd_ctrl); if (rc != UNIFYFS_SUCCESS) { /* detected failure of our client, time to exit */ - break; + thrd_ctrl->exit_flag = 1; } /* bail out if we've been told to exit */ - if (thrd_ctrl->exit_flag == 1) { + if (thrd_ctrl->exit_flag) { break; } } From 84187bb029dbfa91644c25986f1aac5b8c2b0f3a Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Tue, 2 Aug 2022 14:00:18 -0700 Subject: [PATCH 75/81] return on separate line for debugging --- common/src/unifyfs_configurator.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/src/unifyfs_configurator.c b/common/src/unifyfs_configurator.c index 39d729da7..7451e6f84 100644 --- a/common/src/unifyfs_configurator.c +++ b/common/src/unifyfs_configurator.c @@ -404,7 +404,8 @@ char* getenv_helper(const char* section, ndx += sprintf(envname + ndx, "_%u", mentry); //fprintf(stderr, "UNIFYFS CONFIG DEBUG: checking env var %s\n", envname); - return getenv(envname); + char* val = getenv(envname); + return val; } From f6084e3346736233027dbb4f6c5df615a0aff78c Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Tue, 2 Aug 2022 14:05:59 -0700 Subject: [PATCH 76/81] server: add config options to set rpc timeouts TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- common/src/unifyfs_configurator.h | 2 ++ docs/configuration.rst | 12 +++++++----- server/src/margo_server.c | 6 +++++- server/src/margo_server.h | 2 ++ server/src/unifyfs_group_rpc.c | 2 +- server/src/unifyfs_p2p_rpc.c | 2 +- server/src/unifyfs_server.c | 10 ++++++++++ 7 files changed, 28 insertions(+), 8 deletions(-) diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index e187215ad..8c1f147b2 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -85,8 +85,10 @@ UNIFYFS_CFG(logio, spill_size, INT, UNIFYFS_LOGIO_SPILL_SIZE, "log-based I/O spillover file size", NULL) \ UNIFYFS_CFG(logio, spill_dir, STRING, NULLSTRING, "spillover directory", configurator_directory_check) \ UNIFYFS_CFG(margo, client_pool_size, INT, UNIFYFS_MARGO_POOL_SZ, "size of server's ULT pool for client-server RPCs", NULL) \ + UNIFYFS_CFG(margo, client_timeout, INT, UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC, "timeout in milliseconds for 
client-server RPCs", NULL) \ UNIFYFS_CFG(margo, lazy_connect, BOOL, on, "wait until first communication with server to resolve its connection address", NULL) \ UNIFYFS_CFG(margo, server_pool_size, INT, UNIFYFS_MARGO_POOL_SZ, "size of server's ULT pool for server-server RPCs", NULL) \ + UNIFYFS_CFG(margo, server_timeout, INT, UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC, "timeout in milliseconds for server-server RPCs", NULL) \ UNIFYFS_CFG(margo, tcp, BOOL, on, "use TCP for server-to-server margo RPCs", NULL) \ UNIFYFS_CFG(meta, range_size, INT, UNIFYFS_META_DEFAULT_SLICE_SZ, "metadata range size", NULL) \ UNIFYFS_CFG_CLI(runstate, dir, STRING, RUNDIR, "runstate file directory", configurator_directory_check, 'R', "specify full path to directory to contain server-local state") \ diff --git a/docs/configuration.rst b/docs/configuration.rst index 8c083ba90..a61f11c28 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -116,11 +116,13 @@ files. .. table:: ``[margo]`` section - margo server NA settings :widths: auto - === ==== ================================================================================= - Key Type Description - === ==== ================================================================================= - tcp BOOL Use TCP for server-to-server rpcs (default: on, turn off to enable libfabric RMA) - === ==== ================================================================================= + ============== ==== ================================================================================= + Key Type Description + ============== ==== ================================================================================= + tcp BOOL Use TCP for server-to-server rpcs (default: on, turn off to enable libfabric RMA) + client_timeout INT timeout in milliseconds for rpcs between client and server (default: 5000) + server_timeout INT timeout in milliseconds for rpcs between servers (default: 15000) + ============== ==== ================================================================================= .. 
table:: ``[runstate]`` section - server runstate settings :widths: auto diff --git a/server/src/margo_server.c b/server/src/margo_server.c index dd435798a..1b25215a2 100644 --- a/server/src/margo_server.c +++ b/server/src/margo_server.c @@ -30,6 +30,10 @@ bool margo_use_tcp = true; bool margo_lazy_connect; // = false int margo_client_server_pool_sz = UNIFYFS_MARGO_POOL_SZ; int margo_server_server_pool_sz = UNIFYFS_MARGO_POOL_SZ; +double margo_client_server_timeout_msec = + UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; +double margo_server_server_timeout_msec = + UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC; int margo_use_progress_thread = 1; // records pmi rank, server address string, and server address @@ -657,7 +661,7 @@ static hg_handle_t create_client_handle(hg_id_t id, static int forward_to_client(hg_handle_t hdl, void* input_ptr) { - double timeout_msec = UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; + double timeout_msec = margo_client_server_timeout_msec; hg_return_t hret = margo_forward_timed(hdl, input_ptr, timeout_msec); if (hret != HG_SUCCESS) { LOGERR("margo_forward_timed() failed - %s", HG_Error_to_string(hret)); diff --git a/server/src/margo_server.h b/server/src/margo_server.h index 36ac113fe..ac6338794 100644 --- a/server/src/margo_server.h +++ b/server/src/margo_server.h @@ -70,6 +70,8 @@ extern bool margo_use_tcp; extern bool margo_lazy_connect; extern int margo_client_server_pool_sz; extern int margo_server_server_pool_sz; +extern double margo_client_server_timeout_msec; +extern double margo_server_server_timeout_msec; int margo_server_rpc_init(void); int margo_server_rpc_finalize(void); diff --git a/server/src/unifyfs_group_rpc.c b/server/src/unifyfs_group_rpc.c index e839805d7..99adcf5df 100644 --- a/server/src/unifyfs_group_rpc.c +++ b/server/src/unifyfs_group_rpc.c @@ -54,7 +54,7 @@ static int forward_child_request(void* input_ptr, int ret = UNIFYFS_SUCCESS; /* call rpc function */ - double timeout_ms = UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC; + double timeout_ms = margo_server_server_timeout_msec; hg_return_t hret = margo_iforward_timed(chdl, input_ptr, timeout_ms, creq); if (hret != HG_SUCCESS) { LOGERR("failed to forward request(%p) - %s", creq, diff --git a/server/src/unifyfs_p2p_rpc.c b/server/src/unifyfs_p2p_rpc.c index 95ff050cf..a447debad 100644 --- a/server/src/unifyfs_p2p_rpc.c +++ b/server/src/unifyfs_p2p_rpc.c @@ -59,7 +59,7 @@ int forward_p2p_request(void* input_ptr, int rc = UNIFYFS_SUCCESS; /* call rpc function */ - double timeout_ms = UNIFYFS_MARGO_SERVER_SERVER_TIMEOUT_MSEC; + double timeout_ms = margo_server_server_timeout_msec; hg_return_t hret = margo_iforward_timed(req->handle, input_ptr, timeout_ms, &(req->request)); if (hret != HG_SUCCESS) { diff --git a/server/src/unifyfs_server.c b/server/src/unifyfs_server.c index eb3c55bb3..fc5686595 100644 --- a/server/src/unifyfs_server.c +++ b/server/src/unifyfs_server.c @@ -416,6 +416,16 @@ int main(int argc, char* argv[]) LOGDBG("initializing RPC service"); + rc = configurator_int_val(server_cfg.margo_client_timeout, &l); + if (0 == rc) { + margo_client_server_timeout_msec = (double) l; + } + + rc = configurator_int_val(server_cfg.margo_server_timeout, &l); + if (0 == rc) { + margo_server_server_timeout_msec = (double) l; + } + rc = configurator_int_val(server_cfg.margo_client_pool_size, &l); if (0 == rc) { margo_client_server_pool_sz = l; From b1fb26d9ff20418e0348b7ff6e532b9170963d8b Mon Sep 17 00:00:00 2001 From: Adam Moody Date: Tue, 2 Aug 2022 15:11:44 -0700 Subject: [PATCH 77/81] client: add config option 
to set rpc timeouts TEST_CHECKPATCH_SKIP_FILES="common/src/unifyfs_configurator.h" --- client/src/margo_client.c | 47 ++++++++++++++++++++++++++------------- client/src/margo_client.h | 3 ++- client/src/unifyfs_api.c | 12 +++++++++- 3 files changed, 45 insertions(+), 17 deletions(-) diff --git a/client/src/margo_client.c b/client/src/margo_client.c index 0151fa646..9fe9cc809 100644 --- a/client/src/margo_client.c +++ b/client/src/margo_client.c @@ -80,7 +80,7 @@ static void register_client_rpcs(client_rpc_context_t* ctx) } /* initialize margo client-server rpc */ -int unifyfs_client_rpc_init(void) +int unifyfs_client_rpc_init(double timeout_msecs) { hg_return_t hret; @@ -116,6 +116,9 @@ int unifyfs_client_rpc_init(void) return UNIFYFS_FAILURE; } + /* timeout value to use on rpc operations */ + ctx->timeout = timeout_msecs; + /* initialize margo */ int use_progress_thread = 1; int ult_pool_sz = 1; @@ -219,9 +222,11 @@ static hg_handle_t create_handle(hg_id_t id) return handle; } -static int forward_to_server(hg_handle_t hdl, void* input_ptr) +static int forward_to_server( + hg_handle_t hdl, + void* input_ptr, + double timeout_msec) { - double timeout_msec = UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; hg_return_t hret = margo_forward_timed(hdl, input_ptr, timeout_msec); if (hret != HG_SUCCESS) { LOGERR("margo_forward_timed() failed - %s", HG_Error_to_string(hret)); @@ -251,7 +256,8 @@ int invoke_client_mount_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the mount rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of mount rpc to server failed"); margo_destroy(handle); @@ -333,7 +339,8 @@ int invoke_client_attach_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the attach rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of attach rpc to server failed"); margo_destroy(handle); @@ -380,7 +387,8 @@ int invoke_client_unmount_rpc(unifyfs_client* client) /* call rpc function */ LOGDBG("invoking the unmount rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of unmount rpc to server failed"); margo_destroy(handle); @@ -439,7 +447,8 @@ int invoke_client_metaset_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the metaset rpc function in client - gfid:%d file:%s", in.attr.gfid, in.attr.filename); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of metaset rpc to server failed"); margo_destroy(handle); @@ -486,7 +495,8 @@ int invoke_client_metaget_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the metaget rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of metaget rpc to server failed"); margo_destroy(handle); @@ -541,7 +551,8 @@ int invoke_client_filesize_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the filesize rpc 
function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of filesize rpc to server failed"); margo_destroy(handle); @@ -596,7 +607,8 @@ int invoke_client_transfer_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the transfer rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of transfer rpc to server failed"); margo_destroy(handle); @@ -644,7 +656,8 @@ int invoke_client_truncate_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the truncate rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of truncate rpc to server failed"); margo_destroy(handle); @@ -690,7 +703,8 @@ int invoke_client_unlink_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the unlink rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of unlink rpc to server failed"); margo_destroy(handle); @@ -736,7 +750,8 @@ int invoke_client_laminate_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the laminate rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of laminate rpc to server failed"); margo_destroy(handle); @@ -782,7 +797,8 @@ int invoke_client_sync_rpc(unifyfs_client* client, /* call rpc function */ LOGINFO("invoking the sync rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of sync rpc to server failed"); margo_destroy(handle); @@ -841,7 +857,8 @@ int invoke_client_mread_rpc(unifyfs_client* client, /* call rpc function */ LOGDBG("invoking the mread rpc function in client"); - int rc = forward_to_server(handle, &in); + double timeout = client_rpc_context->timeout; + int rc = forward_to_server(handle, &in, timeout); if (rc != UNIFYFS_SUCCESS) { LOGERR("forward of mread rpc to server failed"); margo_destroy(handle); diff --git a/client/src/margo_client.h b/client/src/margo_client.h index 8964f6149..b5bf3f7cc 100644 --- a/client/src/margo_client.h +++ b/client/src/margo_client.h @@ -52,10 +52,11 @@ typedef struct ClientRpcContext { hg_addr_t client_addr; hg_addr_t svr_addr; client_rpcs_t rpcs; + double timeout; /* timeout to wait on rpc, in millisecs */ } client_rpc_context_t; -int unifyfs_client_rpc_init(void); +int unifyfs_client_rpc_init(double timeout_msecs); int unifyfs_client_rpc_finalize(void); diff --git a/client/src/unifyfs_api.c b/client/src/unifyfs_api.c index 16a1c269c..bdc7617c7 100644 --- a/client/src/unifyfs_api.c +++ b/client/src/unifyfs_api.c @@ -171,6 +171,16 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, client->max_write_index_entries = client->write_index_size / sizeof(unifyfs_index_t); + /* Timeout to wait on rpc calls to server, in milliseconds */ + double 
timeout_msecs = UNIFYFS_MARGO_CLIENT_SERVER_TIMEOUT_MSEC; + cfgval = client_cfg->margo_client_timeout; + if (cfgval != NULL) { + rc = configurator_int_val(cfgval, &l); + if (rc == 0) { + timeout_msecs = (double)l; + } + } + // initialize k-v store access int kv_rank = 0; int kv_nranks = 1; @@ -181,7 +191,7 @@ unifyfs_rc unifyfs_initialize(const char* mountpoint, } /* open rpc connection to server */ - rc = unifyfs_client_rpc_init(); + rc = unifyfs_client_rpc_init(timeout_msecs); if (rc != UNIFYFS_SUCCESS) { LOGERR("failed to initialize client RPC"); return rc; From 673106ba3d9998d06593939094860522e2537e7c Mon Sep 17 00:00:00 2001 From: Ross Miller Date: Fri, 19 Aug 2022 19:07:48 -0400 Subject: [PATCH 78/81] Documentation: change link command examples (#713) * Documentation: change link command examples Change the example link commands to use -lunifyfs_mpi_gotcha instead of -lunifyfs_gotcha. See issue LLNL/UnifyFS#711. * Update link command examples Update the linking docs to distinguish between the libraries one uses when automount is enabled and when it isn't. Added some text to explain this. * Update docs/link.rst Co-authored-by: Adam Moody * Update docs/link.rst Co-authored-by: Adam Moody * Update docs/link.rst Co-authored-by: Adam Moody * Fix typo in link.rst Co-authored-by: Adam Moody --- docs/link.rst | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/docs/link.rst b/docs/link.rst index 13849a8ae..7d07e3722 100644 --- a/docs/link.rst +++ b/docs/link.rst @@ -29,22 +29,58 @@ To make this easier, UnifyFS installs a unifyfs-config script that one should in Dynamic link ------------ +A build of UnifyFS includes two different shared libraries. Which one you +should link against depends on your application. If you wish to take advantage +of the UnifyFS auto-mount feature (assuming the feature was enabled at +compile-time), then you should link against ``libunifyfs_mpi_gotcha.so``. If +you are not building an MPI-enabled application, or if you want explicit +control over when UnifyFS filesystem is mounted and unmounted, then link +against ``libunifyfs_gotcha.so``. In this case, you will also have to add +calls to ``unifyfs_mount`` and ``unifyfs_unmount`` in the appropriate +locations in your code. See :doc:`api`. + To intercept I/O calls using gotcha, use the following syntax to link an application: C ************** +For code that uses the auto-mount feature: + +.. code-block:: Bash + + $ mpicc -o test_write test_write.c \ + -L/lib -lunifyfs_mpi_gotcha + + +For code that explicitly calls ``unifyfs_mount`` and ``unifyfs_unmount``: .. code-block:: Bash $ mpicc -o test_write test_write.c \ - -I/include -L/lib -lunifyfs_gotcha \ - -L/lib64 -lgotcha + -I/include -L/lib -lunifyfs_gotcha + +Note the use of the ``-I`` option so that the compiler knows where to find +the ``unifyfs.h`` header file. + Fortran ************** +For code that uses the auto-mount feature: + +.. code-block:: Bash + + $ mpif90 -o test_write test_write.F \ + -L/lib -lunifyfs_mpi_gotcha + +For code that explicitly calls ``unifyfs_mount`` and ``unifyfs_unmount``: .. code-block:: Bash $ mpif90 -o test_write test_write.F \ -I/include -L/lib -lunifyfsf -lunifyfs_gotcha + +Note the use of the ``-I`` option to specify the location of the +``unifyfsf.h`` header. Also note the use of the ``unifyfsf`` library. This +library provides the Fortran bindings for the ``unifyfs_mount`` and +``unifyfs_unmount`` functions. 
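For readers following the explicit-mount path described in the link.rst changes above, a minimal sketch of such an application is shown below. This sketch is illustrative only and is not part of the patch; it assumes the ``unifyfs_mount(prefix, rank, size)`` / ``unifyfs_unmount()`` interface described in docs/api.rst, and the ``/unifyfs`` prefix and output file name are placeholders.

.. code-block:: C

    #include <stdio.h>
    #include <mpi.h>
    #include <unifyfs.h>   /* declares unifyfs_mount() and unifyfs_unmount() */

    int main(int argc, char* argv[])
    {
        int rank, size;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* attach to UnifyFS before doing any I/O under the mount prefix;
         * a return of 0 indicates success */
        if (unifyfs_mount("/unifyfs", rank, size) == 0) {
            FILE* fp = fopen("/unifyfs/out.txt", "w");
            if (fp != NULL) {
                fprintf(fp, "hello from rank %d\n", rank);
                fclose(fp);
            }

            /* detach once all UnifyFS I/O is complete */
            unifyfs_unmount();
        }

        MPI_Finalize();
        return 0;
    }

Such a program would be compiled and linked with the explicit-mount command shown above (``-lunifyfs_gotcha``), since the GOTCHA library is what intercepts the ``fopen``/``fprintf`` calls made under the ``/unifyfs`` prefix.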
+ From ce0912db3cecb758f2bf4b5de9e022f9bd3ae9fe Mon Sep 17 00:00:00 2001 From: CamStan Date: Wed, 10 Aug 2022 12:17:41 -0700 Subject: [PATCH 79/81] Update CI to run subsets of integration tests Update scripts for running the integration tests to allow for more control by specifying which suites and types to run. This allows for much easier creation of GitLab CI jobs that run specific tests by simply changing the options. Update GitLab CI jobs to make use of the more fine-grained control of the testing scripts. Update .gitlab-ci.yml - Switch to using service user account - Add two new stages: - New install-deps stage - Switch to using Spack Environments - Check if existing, otherwise create and install - Make a prereq to the unifyfs build/install job(s) - New clean stage and job - Create a full_clean job that only runs on a schedule and cleans up all old builds from the gitlab runner directory - Clean up default before_script to simply activate spack environment and print status - Add after_script section to integration tests to remove repo/logs after successful jobs to save space. Print location of logs for failed jobs for debugging. - Add quartz system-specific jobs Update the system-specific jobs: - Create templates for making scheduled jobs - Move shared variables for each compiler-specific job into templates - Add jobs to install dependencies that the build jobs depend on - Split integration tests for each compiler into multiple smaller jobs running specific tests (writeread-posix, writeread-mpiio, read-posix, read-mpiio, producer-consumer-all, and stage) - Add scheduled jobs for running stage tests across a larger set of nodes Additional updates: - Update testing docs - Add .spack-env/ directory and spack.yaml files for each system/compiler for creating/installing/activating the spack environments and allow for easier maintenance and replication TEST_CHECKPATCH_SKIP_FILES=t/ci/RUN_CI_TESTS.sh --- .github/workflows/build-and-test.yml | 2 +- .gitlab-ci.yml | 122 +++-- .gitlab/ascent.yml | 186 +++++--- .gitlab/catalyst.yml | 167 +++++-- .gitlab/lassen.yml | 167 +++++-- .gitlab/quartz.yml | 175 ++++++++ .spack-env/unifyfs-lsf-gcc4_9_3/spack.yaml | 15 + .spack-env/unifyfs-lsf-gcc8_3_1/spack.yaml | 15 + .spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml | 15 + .spack-env/unifyfs-slurm-gcc4_9_3/spack.yaml | 15 + docs/dependencies.rst | 2 +- docs/run.rst | 2 + docs/testing.rst | 190 ++++++-- t/ci/001-setup.sh | 55 +-- t/ci/002-start-server.sh | 4 +- t/ci/100-writeread-tests.sh | 4 +- t/ci/110-write-tests.sh | 4 +- t/ci/120-read-tests.sh | 4 +- t/ci/300-producer-consumer-tests.sh | 4 +- t/ci/RUN_CI_TESTS.sh | 423 ++++++++++++------ t/ci/setup-lsf.sh | 4 +- t/ci/setup-slurm.sh | 2 +- 22 files changed, 1168 insertions(+), 409 deletions(-) create mode 100644 .gitlab/quartz.yml create mode 100644 .spack-env/unifyfs-lsf-gcc4_9_3/spack.yaml create mode 100644 .spack-env/unifyfs-lsf-gcc8_3_1/spack.yaml create mode 100644 .spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml create mode 100644 .spack-env/unifyfs-slurm-gcc4_9_3/spack.yaml diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e3de511a7..277f6fa4f 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -113,7 +113,7 @@ jobs: - name: Install UnifyFS dependencies run: | - spack install gotcha@1.0.3 + spack install gotcha@develop spack install mochi-margo@0.9.6 ^mercury~boostsys ^libfabric@1.12.1 fabrics=rxm,sockets,tcp spack install spath~mpi echo "GOTCHA_INSTALL=$(spack 
location -i gotcha)" >> $GITHUB_ENV diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 0d864977d..ac8bd21a1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,9 +3,11 @@ # DAG. stages: - init + - install-deps - build - test-unit - test-integ + - clean ##### System Templates ##### @@ -14,10 +16,12 @@ stages: # these are LLNL specific, but can be adjusted or added to as new # systems become available. # -# The NNODES, WALL_TIME, and STORAGE_SIZE variables can be altered in -# Gitlab interface if/when the defaults need to be changed. +# The NNODES, WALL_TIME, QUEUE, and STORAGE_SIZE variables can be altered in +# Gitlab web interface if/when the defaults need to be changed. .base-template: + variables: + LLNL_SERVICE_USER: unifysrv retry: max: 1 when: @@ -26,21 +30,21 @@ stages: .slurm-single-node-template: variables: - LLNL_SLURM_SCHEDULER_PARAMETERS: "-N 1 -p $QUEUE -t $UNIT_WALL_TIME -J unifyfs-unit-tests" + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N 1 -p $QUEUE -t $UNIT_WALL_TIME" .slurm-multi-node-template: variables: - LLNL_SLURM_SCHEDULER_PARAMETERS: "-N $NNODES -p $QUEUE -t $INTEG_WALL_TIME -J unifyfs-integ-tests" + LLNL_SLURM_SCHEDULER_PARAMETERS: "-N $NNODES -p $QUEUE -t $INTEG_WALL_TIME" .lsf-single-node-template: variables: - LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes 1 -q $QUEUE -W $UNIT_WALL_TIME -J unifyfs-unit-tests" - SCHEDULER_PARAMETERS: "-nnodes 1 -P $PROJECT_ID -W $UNIT_WALL_TIME -J unifyfs-unit-tests" + LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes 1 -q $QUEUE -W $UNIT_WALL_TIME" + SCHEDULER_PARAMETERS: "-nnodes 1 -P $PROJECT_ID -W $UNIT_WALL_TIME" .lsf-multi-node-template: variables: - LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes $NNODES $STAGE_STORAGE -q $QUEUE -W $INTEG_WALL_TIME -J unifyfs-integ-tests" - SCHEDULER_PARAMETERS: "-nnodes $NNODES -P $PROJECT_ID -W $INTEG_WALL_TIME -J unifyfs-integ-tests" + LLNL_LSF_SCHEDULER_PARAMETERS: "-nnodes $NNODES $STAGE_STORAGE -q $QUEUE -W $INTEG_WALL_TIME" + SCHEDULER_PARAMETERS: "-nnodes $NNODES -P $PROJECT_ID -W $INTEG_WALL_TIME" ##### Job Templates ##### @@ -58,9 +62,23 @@ stages: script: - git clone -b ${CI_COMMIT_BRANCH} --depth=1 ${CI_REPOSITORY_URL} $WORKING_DIR -# Build script used by each system. The CC and FC variables are set in -# the specific job scripts and evaluated in the before_script in order -# to customize which compiler will be used for each job. +# Check if Spack is installed, but don't install if not. Doing so may create +# issues with the user's/service user's Spack installation. +# Check if the SPACK_ENV_NAME set in the job already exists. If so, this does +# nothing. If not, create and install the Spack Environment for subsequent jobs +# to use. +.install-deps-template: + stage: install-deps + before_script: + - which spack || ((cd $HOME/spack && git describe) && . $HOME/spack/share/spack/setup-env.sh) + - module load $COMPILER + script: + - spack env list | grep $SPACK_ENV_NAME || (spack env create $SPACK_ENV_NAME .spack-env/${SPACK_ENV_NAME}/spack.yaml && spack env activate $SPACK_ENV_NAME && spack install --fail-fast) + needs: [] + +# Build script used by each system to build UnifyFS. The CC and FC variables are +# set in the specific job scripts and evaluated here in order to customize which +# compiler will be used for each job. # An artifact is created to pass on to the testing stages. The # test-unit stage requires the unifyfs-build/ files and the test-integ # stage requires the unifyfs-install/ files. 
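# For reference, the dense one-liner in .install-deps-template above is roughly
# equivalent to the following manual sequence (a sketch only; the environment
# name unifyfs-slurm-gcc10_2_1 is one of the spack.yaml directories added by
# this patch, and SPACK_ENV_NAME selects the right one per system/compiler):
#
#   spack env list | grep unifyfs-slurm-gcc10_2_1 \
#     || (spack env create unifyfs-slurm-gcc10_2_1 .spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml \
#         && spack env activate unifyfs-slurm-gcc10_2_1 \
#         && spack install --fail-fast)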
@@ -69,18 +87,17 @@ stages: script: - CC_PATH=$($CC_COMMAND) - FC_PATH=$($FC_COMMAND) - - GOTCHA_INSTALL=$(spack location -i gotcha %$SPACK_COMPILER arch=$SPACK_ARCH) - - SPATH_INSTALL=$(spack location -i spath %$SPACK_COMPILER arch=$SPACK_ARCH) + - GOTCHA_INSTALL=$(spack location -i gotcha) + - SPATH_INSTALL=$(spack location -i spath) - ./autogen.sh - mkdir -p unifyfs-build unifyfs-install && cd unifyfs-build - ../configure CC=$CC_PATH FC=$FC_PATH --prefix=${WORKING_DIR}/unifyfs-install --with-gotcha=$GOTCHA_INSTALL --with-spath=$SPATH_INSTALL --enable-fortran --disable-silent-rules - make V=1 - make V=1 install - needs: [] artifacts: name: "${CI_JOB_NAME}-${CI_PIPELINE_ID}" untracked: true - expire_in: 1 hour + expire_in: 6 hour paths: - unifyfs-build/ - unifyfs-install/ @@ -92,46 +109,69 @@ stages: after_script: - rm -rf /tmp/unify* /tmp/tmp.* /tmp/mdhim* /tmp/na_sm | true -# Variables here are used for the integration test suite and can be -# adjusted in the Gitlab interface. See our testing documentation for -# full details. +# Run the integration test suite with the options provided from the specific +# job. +# Variables used in the integration test suite can be adjusted in the Gitlab web +# interface. See our testing documentation for full details. +# If the job was successful, clean up the build directory in the after_script. .integ-test-template: stage: test-integ script: - - cd t/ci && unbuffer prove -v RUN_CI_TESTS.sh + - cd t/ci && unbuffer prove -v RUN_CI_TESTS.sh $CI_TEST_OPTIONS + after_script: + - > + echo "CI_JOB_STATUS: $CI_JOB_STATUS"; + if [[ $CI_JOB_STATUS == 'success' ]]; then + echo "Cleaning CI_BUILDS_DIR: $CI_BUILDS_DIR" + rm -rf $CI_BUILDS_DIR + elif [[ $CI_JOB_STATUS == 'failed' ]]; then + echo "Logs for debugging available in:" + echo "CI_BUILDS_DIR: $CI_BUILDS_DIR" + else + echo "Job status unknown" + fi ##### Jobs ##### -# Since Gitlab currently runs in the user's home environment, the -# before_script is currently only set up to load the proper Spack -# modules, if they are available, to prevent changing the user's -# environment. Install any needed modules in the user's environment -# prior to running when new compilers or architectures need to be -# tested. +# Default before_script for each job. Use an alternative working directory (if +# necessary), module load the compiler associated with this job, and activate +# the Spack Environment associated with this job. +# +# For jobs running in the not-default location, change directories to the +# WORKING_DIR directory. Otherwise, set WORKING_DIR to be the CI_PROJECT_DIR for +# the build step. +# TODO: Look into CUSTOM_CI_BUILDS_DIR as an alternative (must work on ascent). # -# For jobs running in the not-default location, change directories -# to the WORKING_DIR directory. Otherwise, set WORKING_DIR to be the -# CI_PROJECT_DIR for the build step. +# The COMPILER variable (evaluated here) is set in the specific job scripts. # -# The COMPILER, CC_PATH, and FC_PATH variables are evaluated here. Set -# them in their specific job scripts. -# SPACK_COMPILER and SPACK_ARCH are then set to load the matching -# dependencies for the desired compiler. +# Activate the Spack Environment created in the install-deps job. before_script: - - which spack || ((cd $HOME/spack && git describe) && . 
$HOME/spack/share/spack/setup-env.sh) - if [[ -d $WORKING_DIR ]]; then cd ${WORKING_DIR}; else export WORKING_DIR=${CI_PROJECT_DIR}; fi - module load $COMPILER - - SPACK_COMPILER=${COMPILER//\//@} - - SPACK_ARCH="$(spack arch -p)-$(spack arch -o)-$(uname -m)" - - spack load automake@1.15 %$SPACK_COMPILER arch=$SPACK_ARCH - - spack load gotcha %$SPACK_COMPILER arch=$SPACK_ARCH - - spack load argobots %$SPACK_COMPILER arch=$SPACK_ARCH - - spack load mercury %$SPACK_COMPILER arch=$SPACK_ARCH - - spack load mochi-margo %$SPACK_COMPILER arch=$SPACK_ARCH - - spack load spath %$SPACK_COMPILER arch=$SPACK_ARCH + - spack env activate $SPACK_ENV_NAME && spack env status && spack find + +# Scheduled job to fully clean the runner directory to avoid space issues that +# may accumulate over time. +# +# Running with lsf tag here, but this may be too vague as the job may attempt to +# use a runner on a system the service user doesn't have permission to access. +# If so, move this job to a specific system. +full_clean: + stage: clean + extends: .base-template + variables: + GIT_STRATEGY: none + rules: + - if: $FULL_CLEAN == "YES" && $CI_PIPELINE_SOURCE == "schedule" + tags: + - shell + before_script: [] + script: rm -rf ${HOME}/.jacamar-ci/* + needs: [] # System specific jobs include: - local: .gitlab/ascent.yml - local: .gitlab/catalyst.yml - local: .gitlab/lassen.yml + - local: .gitlab/quartz.yml diff --git a/.gitlab/ascent.yml b/.gitlab/ascent.yml index 8fee5129f..f1f68bf81 100644 --- a/.gitlab/ascent.yml +++ b/.gitlab/ascent.yml @@ -3,7 +3,7 @@ # The WORKING_DIR envar is defined to allow the init job to clone the # git repo to a different location than the default. Subsequent jobs # will then `cd` to this directory during their before_script stage. -# The WORKING_DIR_BASE envar is definied in the Gitlab UI. +# The WORKING_DIR_BASE envar is defined in the Gitlab Web UI. # # The RUN_ASCENT variable can be toggled in the Gitlab interface to # toggle whether jobs should be run on this system. 
@@ -14,8 +14,19 @@ rules: - if: '$RUN_ASCENT != "ON"' when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never - when: on_success +.ascent-scheduled-template: + variables: + WORKING_DIR: ${WORKING_DIR_BASE}/${CI_PIPELINE_ID}/source + extends: .base-template + rules: + - if: '$RUN_ASCENT != "ON"' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + .ascent-shell-template: extends: .ascent-template tags: [nobatch] @@ -26,90 +37,155 @@ extends: .ascent-template tags: [batch] +.ascent-scheduled-shell-template: + extends: .ascent-scheduled-template + tags: [nobatch] + +.ascent-scheduled-batch-template: + variables: + JOB_LAUNCH_COMMAND: "jsrun -r1 -n1" + extends: .ascent-scheduled-template + tags: [batch] + ##### All Ascent Jobs ##### ### gcc@4.8.5 ### -ascent-gcc-4_8_5-init: - extends: [.ascent-shell-template, .init-template] - -ascent-gcc-4_8_5-build: +.ascent-gcc-4_8_5-template: variables: COMPILER: gcc/4.8.5 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.ascent-shell-template, .build-template] + SPACK_ENV_NAME: "unifyfs-lsf-gcc4_8_5" + +ascent-gcc-4_8_5-init: + extends: [.ascent-shell-template, .init-template] + +ascent-gcc-4_8_5-install-deps: + extends: [.ascent-shell-template, .ascent-gcc-4_8_5-template, .install-deps-template] needs: ["ascent-gcc-4_8_5-init"] +ascent-gcc-4_8_5-build: + extends: [.ascent-shell-template, .ascent-gcc-4_8_5-template, .build-template] + needs: ["ascent-gcc-4_8_5-install-deps"] + ascent-gcc-4_8_5-unit-test: + extends: [.lsf-single-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .unit-test-template] + needs: ["ascent-gcc-4_8_5-build"] + +# Integration tests + +ascent-gcc-4_8_5-integ-test-writeread-posix: variables: - COMPILER: gcc/4.8.5 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .integ-test-template] needs: ["ascent-gcc-4_8_5-build"] -ascent-gcc-4_8_5-integ-test: +ascent-gcc-4_8_5-integ-test-writeread-mpiio: variables: - COMPILER: gcc/4.8.5 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .integ-test-template] needs: ["ascent-gcc-4_8_5-build"] +ascent-gcc-4_8_5-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .integ-test-template] + needs: ["ascent-gcc-4_8_5-build"] -### gcc@10.2.0 ### -ascent-gcc-10_2_0-init: - extends: [.ascent-shell-template, .init-template] +ascent-gcc-4_8_5-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .integ-test-template] + needs: ["ascent-gcc-4_8_5-build"] -ascent-gcc-10_2_0-build: +ascent-gcc-4_8_5-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, .integ-test-template] + needs: ["ascent-gcc-4_8_5-build"] + +ascent-gcc-4_8_5-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-4_8_5-template, 
.integ-test-template] + needs: ["ascent-gcc-4_8_5-build"] + + +### gcc@10.2.0 ### +.ascent-gcc-10_2_0-template: variables: COMPILER: gcc/10.2.0 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.ascent-shell-template, .build-template] + +ascent-gcc-10_2_0-init: + extends: [.ascent-shell-template, .init-template] + +ascent-gcc-10_2_0-install-deps: + extends: [.ascent-shell-template, .ascent-gcc-10_2_0-template, .install-deps-template] needs: ["ascent-gcc-10_2_0-init"] +ascent-gcc-10_2_0-build: + extends: [.ascent-shell-template, .ascent-gcc-10_2_0-template, .build-template] + needs: ["ascent-gcc-10_2_0-install-deps"] + ascent-gcc-10_2_0-unit-test: + extends: [.lsf-single-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .unit-test-template] + needs: ["ascent-gcc-10_2_0-build"] + +# Integration tests + +ascent-gcc-10_2_0-integ-test-writeread-posix: variables: - COMPILER: gcc/10.2.0 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] needs: ["ascent-gcc-10_2_0-build"] -ascent-gcc-10_2_0-integ-test: +ascent-gcc-10_2_0-integ-test-writeread-mpiio: variables: - COMPILER: gcc/10.2.0 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-build"] + +ascent-gcc-10_2_0-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] needs: ["ascent-gcc-10_2_0-build"] +ascent-gcc-10_2_0-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-build"] -### xl@16.1 ### -#ascent-xl-16_1-init: -# extends: [.ascent-shell-template, .init-template] -# -#ascent-xl-16_1-build: -# variables: -# COMPILER: xl/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.ascent-shell-template, .build-template] -# needs: ["ascent-xl-16_1-init"] -# -#ascent-xl-16_1-unit-test: -# variables: -# COMPILER: xl/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.lsf-single-node-template, .ascent-batch-template, .unit-test-template] -# needs: ["ascent-xl-16_1-build"] +ascent-gcc-10_2_0-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-build"] + +ascent-gcc-10_2_0-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .ascent-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-build"] + +# Larger Scheduled Stage Job(s) # -#ascent-xl-16_1-integ-test: -# variables: -# COMPILER: xlc/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.lsf-multi-node-template, .ascent-batch-template, .integ-test-template] -# needs: ["ascent-xl-16_1-build"] +ascent-gcc-10_2_0-scheduled-init: + extends: 
[.ascent-scheduled-shell-template, .init-template] + +ascent-gcc-10_2_0-scheduled-install-deps: + extends: [.ascent-scheduled-shell-template, .ascent-gcc-10_2_0-template, .install-deps-template] + needs: ["ascent-gcc-10_2_0-scheduled-init"] + +ascent-gcc-10_2_0-scheduled-build: + extends: [.ascent-scheduled-shell-template, .ascent-gcc-10_2_0-template, .build-template] + needs: ["ascent-gcc-10_2_0-scheduled-install-deps"] + +ascent-gcc-10_2_0-integ-test-scheduled-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .ascent-scheduled-batch-template, .ascent-gcc-10_2_0-template, .integ-test-template] + needs: ["ascent-gcc-10_2_0-scheduled-build"] diff --git a/.gitlab/catalyst.yml b/.gitlab/catalyst.yml index 18b96159d..8ac467b6f 100644 --- a/.gitlab/catalyst.yml +++ b/.gitlab/catalyst.yml @@ -7,8 +7,17 @@ rules: - if: '$RUN_CATALYST != "ON"' when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never - when: on_success +.catalyst-scheduled-template: + extends: .base-template + rules: + - if: '$RUN_CATALYST != "ON"' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + .catalyst-shell-template: extends: .catalyst-template tags: @@ -21,78 +30,146 @@ - catalyst - batch +.catalyst-scheduled-shell-template: + extends: .catalyst-scheduled-template + tags: + - catalyst + - shell + +.catalyst-scheduled-batch-template: + extends: .catalyst-scheduled-template + tags: + - catalyst + - batch + ##### All Catalyst Jobs ##### ### gcc@4.9.3 ### -catalyst-gcc-4_9_3-build: +.catalyst-gcc-4_9_3-template: variables: COMPILER: gcc/4.9.3 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.catalyst-shell-template, .build-template] + SPACK_ENV_NAME: "unifyfs-slurm-gcc4_9_3" + +catalyst-gcc-4_9_3-install-deps: + extends: [.catalyst-shell-template, .catalyst-gcc-4_9_3-template, .install-deps-template] + +catalyst-gcc-4_9_3-build: + extends: [.catalyst-shell-template, .catalyst-gcc-4_9_3-template, .build-template] + needs: ["catalyst-gcc-4_9_3-install-deps"] catalyst-gcc-4_9_3-unit-test: + extends: [.slurm-single-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .unit-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +# Integration tests + +catalyst-gcc-4_9_3-integ-test-writeread-posix: variables: - COMPILER: gcc/4.9.3 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .integ-test-template] needs: ["catalyst-gcc-4_9_3-build"] -catalyst-gcc-4_9_3-integ-test: +catalyst-gcc-4_9_3-integ-test-writeread-mpiio: variables: - COMPILER: gcc/4.9.3 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .integ-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +catalyst-gcc-4_9_3-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .integ-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +catalyst-gcc-4_9_3-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.slurm-multi-node-template, .catalyst-batch-template, 
.catalyst-gcc-4_9_3-template, .integ-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +catalyst-gcc-4_9_3-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .integ-test-template] + needs: ["catalyst-gcc-4_9_3-build"] + +catalyst-gcc-4_9_3-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-4_9_3-template, .integ-test-template] needs: ["catalyst-gcc-4_9_3-build"] ### gcc@10.2.1 ### -catalyst-gcc-10_2_1-build: +.catalyst-gcc-10_2_1-template: variables: COMPILER: gcc/10.2.1 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.catalyst-shell-template, .build-template] + SPACK_ENV_NAME: "unifyfs-slurm-gcc10_2_1" + +catalyst-gcc-10_2_1-install-deps: + extends: [.catalyst-shell-template, .catalyst-gcc-10_2_1-template, .install-deps-template] + +catalyst-gcc-10_2_1-build: + extends: [.catalyst-shell-template, .catalyst-gcc-10_2_1-template, .build-template] + needs: ["catalyst-gcc-10_2_1-install-deps"] catalyst-gcc-10_2_1-unit-test: + extends: [.slurm-single-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .unit-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +# Integration tests + +catalyst-gcc-10_2_1-integ-test-writeread-posix: variables: - COMPILER: gcc/10.2.1 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] needs: ["catalyst-gcc-10_2_1-build"] -catalyst-gcc-10_2_1-integ-test: +catalyst-gcc-10_2_1-integ-test-writeread-mpiio: variables: - COMPILER: gcc/10.2.1 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +catalyst-gcc-10_2_1-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +catalyst-gcc-10_2_1-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +catalyst-gcc-10_2_1-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-build"] + +catalyst-gcc-10_2_1-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .catalyst-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] needs: ["catalyst-gcc-10_2_1-build"] +# Larger Scheduled Stage Job(s) + +catalyst-gcc-10_2_1-scheduled-install-deps: + extends: [.catalyst-scheduled-shell-template, .catalyst-gcc-10_2_1-template, .install-deps-template] -### intel@19.0.4 ### -#catalyst-intel-19_0_4-build: -# variables: -# COMPILER: intel/19.0.4 -# CC_COMMAND: "which icc" 
-# FC_COMMAND: "which ifort" -# extends: [.catalyst-shell-template, .build-template] -# -#catalyst-intel-19_0_4-unit-test: -# variables: -# COMPILER: intel/19.0.4 -# CC_COMMAND: "which icc" -# FC_COMMAND: "which ifort" -# extends: [.slurm-single-node-template, .catalyst-batch-template, .unit-test-template] -# needs: ["catalyst-intel-19_0_4-build"] -# -#catalyst-intel-19_0_4-integ-test: -# variables: -# COMPILER: intel/19.0.4 -# CC_COMMAND: "which icc" -# FC_COMMAND: "which ifort" -# extends: [.slurm-multi-node-template, .catalyst-batch-template, .integ-test-template] -# needs: ["catalyst-intel-19_0_4-build"] +catalyst-gcc-10_2_1-scheduled-build: + extends: [.catalyst-scheduled-shell-template, .catalyst-gcc-10_2_1-template, .build-template] + needs: ["catalyst-gcc-10_2_1-scheduled-install-deps"] + +catalyst-gcc-10_2_1-integ-test-scheduled-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .catalyst-scheduled-batch-template, .catalyst-gcc-10_2_1-template, .integ-test-template] + needs: ["catalyst-gcc-10_2_1-scheduled-build"] diff --git a/.gitlab/lassen.yml b/.gitlab/lassen.yml index a4b9a79b9..ea88e384e 100644 --- a/.gitlab/lassen.yml +++ b/.gitlab/lassen.yml @@ -7,8 +7,17 @@ rules: - if: '$RUN_LASSEN != "ON"' when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never - when: on_success +.lassen-scheduled-template: + extends: .base-template + rules: + - if: '$RUN_LASSEN != "ON"' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + .lassen-shell-template: extends: .lassen-template tags: @@ -21,78 +30,146 @@ - lassen - batch +.lassen-scheduled-shell-template: + extends: .lassen-scheduled-template + tags: + - lassen + - shell + +.lassen-scheduled-batch-template: + extends: .lassen-scheduled-template + tags: + - lassen + - batch + ##### All Lassen Jobs ##### ### gcc@4.9.3 ### -lassen-gcc-4_9_3-build: +.lassen-gcc-4_9_3-template: variables: COMPILER: gcc/4.9.3 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.lassen-shell-template, .build-template] + SPACK_ENV_NAME: "unifyfs-lsf-gcc4_9_3" + +lassen-gcc-4_9_3-install-deps: + extends: [.lassen-shell-template, .lassen-gcc-4_9_3-template, .install-deps-template] + +lassen-gcc-4_9_3-build: + extends: [.lassen-shell-template, .lassen-gcc-4_9_3-template, .build-template] + needs: ["lassen-gcc-4_9_3-install-deps"] lassen-gcc-4_9_3-unit-test: + extends: [.lsf-single-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .unit-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +# Integration tests + +lassen-gcc-4_9_3-integ-test-writeread-posix: variables: - COMPILER: gcc/4.9.3 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] needs: ["lassen-gcc-4_9_3-build"] -lassen-gcc-4_9_3-integ-test: +lassen-gcc-4_9_3-integ-test-writeread-mpiio: variables: - COMPILER: gcc/4.9.3 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +lassen-gcc-4_9_3-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: 
[.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +lassen-gcc-4_9_3-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +lassen-gcc-4_9_3-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] + needs: ["lassen-gcc-4_9_3-build"] + +lassen-gcc-4_9_3-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-4_9_3-template, .integ-test-template] needs: ["lassen-gcc-4_9_3-build"] ### gcc@8.3.1 ### -lassen-gcc-8_3_1-build: +.lassen-gcc-8_3_1-template: variables: COMPILER: gcc/8.3.1 CC_COMMAND: "which gcc" FC_COMMAND: "which gfortran" - extends: [.lassen-shell-template, .build-template] + SPACK_ENV_NAME: "unifyfs-lsf-gcc8_3_1" + +lassen-gcc-8_3_1-install-deps: + extends: [.lassen-shell-template, .lassen-gcc-8_3_1-template, .install-deps-template] + +lassen-gcc-8_3_1-build: + extends: [.lassen-shell-template, .lassen-gcc-8_3_1-template, .build-template] + needs: ["lassen-gcc-8_3_1-install-deps"] lassen-gcc-8_3_1-unit-test: + extends: [.lsf-single-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .unit-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +# Integration tests + +lassen-gcc-8_3_1-integ-test-writeread-posix: variables: - COMPILER: gcc/8.3.1 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] needs: ["lassen-gcc-8_3_1-build"] -lassen-gcc-8_3_1-integ-test: +lassen-gcc-8_3_1-integ-test-writeread-mpiio: variables: - COMPILER: gcc/8.3.1 - CC_COMMAND: "which gcc" - FC_COMMAND: "which gfortran" - extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +lassen-gcc-8_3_1-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +lassen-gcc-8_3_1-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +lassen-gcc-8_3_1-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-build"] + +lassen-gcc-8_3_1-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .lassen-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] needs: ["lassen-gcc-8_3_1-build"] +# Larger Scheduled Stage Job(s) + +lassen-gcc-8_3_1-scheduled-install-deps: + extends: [.lassen-scheduled-shell-template, .lassen-gcc-8_3_1-template, 
.install-deps-template] -### xl@16.1 ### -#lassen-xl-16_1-build: -# variables: -# COMPILER: xl/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.lassen-shell-template, .build-template] -# -#lassen-xl-16_1-unit-test: -# variables: -# COMPILER: xl/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.lsf-single-node-template, .lassen-batch-template, .unit-test-template] -# needs: ["lassen-xl-16_1-build"] -# -#lassen-xl-16_1-integ-test: -# variables: -# COMPILER: xlc/16.1 -# CC_COMMAND: "which xlc" -# FC_COMMAND: "which xlf90" -# extends: [.lsf-multi-node-template, .lassen-batch-template, .integ-test-template] -# needs: ["lassen-xl-16_1-build"] +lassen-gcc-8_3_1-scheduled-build: + extends: [.lassen-scheduled-shell-template, .lassen-gcc-8_3_1-template, .build-template] + needs: ["lassen-gcc-8_3_1-scheduled-install-deps"] + +lassen-gcc-8_3_1-integ-test-scheduled-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.lsf-multi-node-template, .lassen-scheduled-batch-template, .lassen-gcc-8_3_1-template, .integ-test-template] + needs: ["lassen-gcc-8_3_1-scheduled-build"] diff --git a/.gitlab/quartz.yml b/.gitlab/quartz.yml new file mode 100644 index 000000000..68af7a780 --- /dev/null +++ b/.gitlab/quartz.yml @@ -0,0 +1,175 @@ +# Quartz Templates + +# The RUN_QUARTZ variable can be toggled in the Gitlab interface to +# toggle whether jobs should be run on this system. +.quartz-template: + extends: .base-template + rules: + - if: '$RUN_QUARTZ != "ON"' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + when: never + - when: on_success + +.quartz-scheduled-template: + extends: .base-template + rules: + - if: '$RUN_QUARTZ != "ON"' + when: never + - if: '$CI_PIPELINE_SOURCE == "schedule"' + +.quartz-shell-template: + extends: .quartz-template + tags: + - quartz + - shell + +.quartz-batch-template: + extends: .quartz-template + tags: + - quartz + - batch + +.quartz-scheduled-shell-template: + extends: .quartz-scheduled-template + tags: + - quartz + - shell + +.quartz-scheduled-batch-template: + extends: .quartz-scheduled-template + tags: + - quartz + - batch + +##### All Quartz Jobs ##### + +### gcc@4.9.3 ### +.quartz-gcc-4_9_3-template: + variables: + COMPILER: gcc/4.9.3 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + SPACK_ENV_NAME: "unifyfs-slurm-gcc4_9_3" + +quartz-gcc-4_9_3-install-deps: + extends: [.quartz-shell-template, .quartz-gcc-4_9_3-template, .install-deps-template] + +quartz-gcc-4_9_3-build: + extends: [.quartz-shell-template, .quartz-gcc-4_9_3-template, .build-template] + needs: ["quartz-gcc-4_9_3-install-deps"] + +quartz-gcc-4_9_3-unit-test: + extends: [.slurm-single-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .unit-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +# Integration tests + +quartz-gcc-4_9_3-integ-test-writeread-posix: + variables: + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +quartz-gcc-4_9_3-integ-test-writeread-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +quartz-gcc-4_9_3-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, 
.integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +quartz-gcc-4_9_3-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +quartz-gcc-4_9_3-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + +quartz-gcc-4_9_3-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-4_9_3-template, .integ-test-template] + needs: ["quartz-gcc-4_9_3-build"] + + +### gcc@10.2.1 ### +.quartz-gcc-10_2_1-template: + variables: + COMPILER: gcc/10.2.1 + CC_COMMAND: "which gcc" + FC_COMMAND: "which gfortran" + SPACK_ENV_NAME: "unifyfs-slurm-gcc10_2_1" + +quartz-gcc-10_2_1-install-deps: + extends: [.quartz-shell-template, .quartz-gcc-10_2_1-template, .install-deps-template] + +quartz-gcc-10_2_1-build: + extends: [.quartz-shell-template, .quartz-gcc-10_2_1-template, .build-template] + needs: ["quartz-gcc-10_2_1-install-deps"] + +quartz-gcc-10_2_1-unit-test: + extends: [.slurm-single-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .unit-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +# Integration tests + +quartz-gcc-10_2_1-integ-test-writeread-posix: + variables: + CI_TEST_OPTIONS: ":: -s writeread -t posix" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +quartz-gcc-10_2_1-integ-test-writeread-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s writeread -t mpiio" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +quartz-gcc-10_2_1-integ-test-read-posix: + variables: + CI_TEST_OPTIONS: ":: -s read -t posix" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +quartz-gcc-10_2_1-integ-test-read-mpiio: + variables: + CI_TEST_OPTIONS: ":: -s read -t mpiio" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +quartz-gcc-10_2_1-integ-test-pc-all: + variables: + CI_TEST_OPTIONS: ":: -s pc -t all" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +quartz-gcc-10_2_1-integ-test-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .quartz-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: ["quartz-gcc-10_2_1-build"] + +# Larger Scheduled Stage Job(s) + +quartz-gcc-10_2_1-scheduled-install-deps: + extends: [.quartz-scheduled-shell-template, .quartz-gcc-10_2_1-template, .install-deps-template] + +quartz-gcc-10_2_1-scheduled-build: + extends: [.quartz-scheduled-shell-template, .quartz-gcc-10_2_1-template, .build-template] + needs: ["quartz-gcc-10_2_1-scheduled-install-deps"] + +quartz-gcc-10_2_1-integ-test-scheduled-stage: + variables: + CI_TEST_OPTIONS: ":: -s stage" + extends: [.slurm-multi-node-template, .quartz-scheduled-batch-template, .quartz-gcc-10_2_1-template, .integ-test-template] + needs: 
["quartz-gcc-10_2_1-scheduled-build"] diff --git a/.spack-env/unifyfs-lsf-gcc4_9_3/spack.yaml b/.spack-env/unifyfs-lsf-gcc4_9_3/spack.yaml new file mode 100644 index 000000000..0e039407d --- /dev/null +++ b/.spack-env/unifyfs-lsf-gcc4_9_3/spack.yaml @@ -0,0 +1,15 @@ +spack: + packages: + all: + compiler: [gcc@4.9.3] + providers: + mpi: [spectrum-mpi] + target: [ppc64le] + specs: + - automake@1.15.1 + - gotcha@develop + - mochi-margo@0.9.6 cflags="-std=gnu99" ^libfabric fabrics=rxm,sockets,tcp + - spath~mpi + view: true + concretizer: + unify: true diff --git a/.spack-env/unifyfs-lsf-gcc8_3_1/spack.yaml b/.spack-env/unifyfs-lsf-gcc8_3_1/spack.yaml new file mode 100644 index 000000000..b95ff4556 --- /dev/null +++ b/.spack-env/unifyfs-lsf-gcc8_3_1/spack.yaml @@ -0,0 +1,15 @@ +spack: + packages: + all: + compiler: [gcc@8.3.1] + providers: + mpi: [spectrum-mpi] + target: [ppc64le] + specs: + - automake@1.15.1 + - gotcha@develop + - mochi-margo@0.9.6 ^libfabric fabrics=rxm,sockets,tcp + - spath~mpi + view: true + concretizer: + unify: true diff --git a/.spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml b/.spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml new file mode 100644 index 000000000..71f027c02 --- /dev/null +++ b/.spack-env/unifyfs-slurm-gcc10_2_1/spack.yaml @@ -0,0 +1,15 @@ +spack: + packages: + all: + compiler: [gcc@10.2.1] + providers: + mpi: [mvapich2] + target: [x86_64] + specs: + - automake@1.15.1 + - gotcha@develop + - mochi-margo@0.9.6 ^libfabric fabrics=rxm,sockets,tcp + - spath~mpi + view: true + concretizer: + unify: true diff --git a/.spack-env/unifyfs-slurm-gcc4_9_3/spack.yaml b/.spack-env/unifyfs-slurm-gcc4_9_3/spack.yaml new file mode 100644 index 000000000..242658840 --- /dev/null +++ b/.spack-env/unifyfs-slurm-gcc4_9_3/spack.yaml @@ -0,0 +1,15 @@ +spack: + packages: + all: + compiler: [gcc@4.9.3] + providers: + mpi: [mvapich2] + target: [x86_64] + specs: + - automake@1.15.1 + - gotcha@develop + - mochi-margo@0.9.6 cflags="-std=gnu99" ^libfabric fabrics=rxm,sockets,tcp + - spath~mpi + view: true + concretizer: + unify: true diff --git a/docs/dependencies.rst b/docs/dependencies.rst index 2422c145a..97d70d6d5 100644 --- a/docs/dependencies.rst +++ b/docs/dependencies.rst @@ -10,7 +10,7 @@ Required - `GOTCHA `_ version 1.0.3 (or later) -- `Margo `_ version 0.9.6 - version 0.9.9 and its dependencies: +- `Margo `_ version 0.9.6 and its dependencies: - `Argobots `_ version 1.1 (or later) - `Mercury `_ version 2.0.1 (or later) diff --git a/docs/run.rst b/docs/run.rst index dcf7ee220..7dcba228f 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -79,10 +79,12 @@ The full usage for ``unifyfs`` is as follows: -S, --share-dir= [REQUIRED] shared file system for use by servers -c, --cleanup [OPTIONAL] clean up the UnifyFS storage upon server exit -i, --stage-in= [OPTIONAL] stage in file(s) listed in file + -P, --stage-parallel [OPTIONAL] use parallel stage-in -T, --stage-timeout= [OPTIONAL] timeout for stage-in operation Command options for "terminate": -o, --stage-out= [OPTIONAL] stage out file(s) listed in on termination + -P, --stage-parallel [OPTIONAL] use parallel stage-out -T, --stage-timeout= [OPTIONAL] timeout for stage-out operation -s, --script= [OPTIONAL] to custom termination script -S, --share-dir= [REQUIRED for --stage-out] shared file system for use by servers diff --git a/docs/testing.rst b/docs/testing.rst index 362717922..7e52fee4e 100644 --- a/docs/testing.rst +++ b/docs/testing.rst @@ -315,29 +315,47 @@ normally be run first to start the UnifyFS daemon. 
E.g., to run just the $ make check TESTS='0001-setup.t 0100-sysio-gotcha.t 9010-stop-unifyfsd.t 9999-cleanup.t' -.. note:: +.. note:: **Running Unit Tests from Spack Install** - If you are using Spack to install UnifyFS then there are two ways to - manually run these tests: + If using Spack to install UnifyFS there are two ways to manually run the + units tests: - 1. Upon your installation with Spack + 1. Upon installation with Spack ``spack install -v --test=root unifyfs`` 2. Manually from Spack's build directory - ``spack install --keep-stage unifyfs`` + Open the Spack config file: - ``spack cd unifyfs`` + ``spack config edit config`` + + Provide Spack a staging path that is visible from a job allocation: + + .. code-block:: yaml + + config: + build_stage: + - /visible/path/allocated/node + # or build directly inside Spack's install directory + - $spack/var/spack/stage - ``cd spack-build/t`` + Include the ``--keep-stage`` option when installing: - ``make check`` + ``spack install --keep-stage unifyfs`` + + ``spack cd unifyfs`` + + ``cd spack-build/t`` + + Run the tests from the package's build environment: + + ``spack build-env unifyfs make check`` The tests in https://github.com/LLNL/UnifyFS/tree/dev/t are run automatically -using `GitHub Actions`_ along with the :ref:`style checks ` when a -pull request is created or updated. All pull requests must pass these tests -before they will be accepted. +using `GitHub Actions`_ along with the :ref:`style checks ` +when a pull request is created or updated. All pull requests must pass these +tests before they will be accepted. Interpreting the Results ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -600,11 +618,11 @@ Misc """" ``KB`` - :math:`2^{10}`. + :math:`2^{10}` ``MB`` - :math:`2^{20}`. + :math:`2^{20}` ``GB`` - :math:`2^{30}`. + :math:`2^{30}` ------------ @@ -614,33 +632,56 @@ Running the Tests .. attention:: UnifyFS's integration test suite requires MPI and currently only supports - ``srun`` and ``jsrun`` MPI launch commands. Changes are coming to support - ``mpirun``. + ``srun`` and ``jsrun`` MPI launch commands. -UnifyFS's integration tests are primarly set up to be run all as one suite. -However, they can be run individually if desired. +UnifyFS's integration tests are primarly set up to run distinct suites of tests, +however they can also all be run at once or manually for more fine-grained +control. The testing scripts in `t/ci`_ depend on sharness_, which is set up in the containing *t/* directory. These tests will not function properly if moved or if -they cannot find the sharness files. +the sharness files cannot be found. -Whether running all tests or individual tests, first make sure you have -either interactively allocated nodes or are submitting a batch job to run -them. +Before running any tests, ensure either compute nodes have been interactively +allocated or run via a batch job submission. Make sure all :ref:`dependencies ` are installed and loaded. -.. note:: +The *t/ci/RUN_CI_TESTS.sh* script is designed to simplify running various suites +of tests. - In order to run the the integration tests from a Spack_ installation of - UnifyFS, you'll need to tell Spack to use a different location for staging - builds in order to have the source files available from inside an allocation. +.. rubric:: ``RUN_CI_TESTS.sh`` Script - Open your Spack config file +.. code-block:: Bash + + Usage: ./RUN_CI_TESTS.sh [-h] -s {all|[writeread,[write|read],pc,stage]} -t {all|[posix,mpiio]} + + Any previously set UnifyFS environment variables will take precedence. 
+ + Options: + -h, --help + Print this help message + + -s, --suite {all|[writeread,[write|read],pc,stage]} + Select the test suite(s) to be run + Takes a comma-separated list of available suites + + -t, --type {all|[posix,mpiio]} + Select the type(s) of each suite to be run + Takes a comma-separated list of available types + Required with --suite unless stage is the only suite selected + +.. note:: **Running Integration Tests from Spack Build** + + Running the integration tests from a Spack_ installation of UnifyFS requires + telling Spack to use a different location for staging the build in order to + have the source files available from inside a job allocation. + + Open the Spack config file: ``spack config edit config`` - and provide a path that is visible during job allocations: + Provide a staging path that is visible to all nodes from a job allocations: .. code-block:: yaml @@ -650,42 +691,104 @@ Make sure all :ref:`dependencies ` are installed and loaded. # or build directly inside Spack's install directory - $spack/var/spack/stage - Then make sure to include the ``--keep-stage`` option when installing: + Include the ``--keep-stage`` option when installing: ``spack install --keep-stage unifyfs`` -Running All Tests + Allocate compute nodes and spawn a new shell containing the package's build + environment: + + ``spack build-env unifyfs bash`` + + Run the integration tests: + + ``spack load unifyfs`` + + ``spack cd unifyfs`` + + ``cd t/ci`` + + # Run tests using any of the following formats + +Individual Suites ^^^^^^^^^^^^^^^^^ -To run all of the tests, simply run ``./RUN_CI_TESTS.sh``. +To run individual test suites, indicate the desired suite(s) and type(s) when +running *RUN_CI_TESTS.sh*. E.g.: .. code-block:: BASH - $ ./RUN_CI_TESTS.sh + $ ./RUN_CI_TESTS.sh -s writeread -t mpiio or .. code-block:: BASH - $ prove -v RUN_CI_TESTS.sh + $ prove -v RUN_CI_TESTS.sh :: -s writeread -t mpiio -Running Individual Tests -^^^^^^^^^^^^^^^^^^^^^^^^ +The ``-s|--suite`` and ``-t|--type`` options flag which set(s) of tests to run. +Each suite (aside from ``stage``) requires a type to be selected as well. Note +that if ``all`` is selected, the other arguments are redundant. If the ``read`` +suite is selected, then the ``write`` argument is redundant. + +Available suites: all|[writeread,[write,read],pc,stage] + all: run all suites + writeread: run writeread tests + write: run write tests only (redundant if read also set) + read: run write then read tests (all-hosts producer-consumer tests) + pc: run producer-consumer tests (disjoint sets of hosts) + stage: run stage tests (type not required) + +Available types: all|[posix,mpiio] + all: run all types + posix: run posix versions of above suites + mpiio: run mpiio versions of above suites + +All Tests +^^^^^^^^^ + +.. warning:: + + If running all or most tests within a single allocation, a large amount of + time and storage space will be required. Even if enough of both are available, + it is still possible the run may hit other limitations (e.g., + ``client_max_files``, ``client_max_active_requests``, + ``server_max_app_clients``). To avoid this, run individual suites from + separate job allocations. + +To run all of the tests, run *RUN_CI_TESTS.sh* with the all suites and types +options. + +.. code-block:: BASH + + $ ./RUN_CI_TESTS.sh -s all -t all + +or + +.. 
code-block:: BASH + + $ prove -v RUN_CI_TESTS.sh :: -s all -t all + +Subsets of Individual Suites +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In order to run individual tests, the testing functions and variables need to be -set up first and then the UnifyFS server needs to be started. +Subsets of individual test suites can be run manually. This can be useful when +wanting more fine-grained control or for testing a specific configuration. To +run manually, the testing functions and variables need to be set up first and +then the UnifyFS servers need to be started. First source the *t/ci/001-setup.sh* script whereafter sharness will change -directories to the ``$SHARNESS_TRASH_DIRECTORY``. To account for this, source -*002-start-server.sh* and each desired test script after that prefixed with -``$UNIFYFS_CI_DIR/``. When finished, source the *990-stop-server.sh* script -last to stop the server and clean up. +directories to the ``$SHARNESS_TRASH_DIRECTORY``. To account for this, prefix +each subsequent script with ``$UNIFYFS_CI_DIR/`` when sourcing. Start the +servers next by sourcing *002-start-server.sh* followed by each desired test +script. When finished, source *990-stop-server.sh* last to stop the servers, +report the results, and clean up. .. code-block:: BASH $ . ./001-setup.sh $ . $UNIFYFS_CI_DIR/002-start-server.sh - $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh + $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle --mpiio $ . $UNIFYFS_CI_DIR/990-stop-server.sh The various CI test suites can be run multiple times with different behaviors. @@ -700,6 +803,7 @@ additional information for that particular suite. options: -h, --help print help message + -l, --laminate laminate between writing and reading -M, --mpiio use MPI-IO instead of POSIX I/O -x, --shuffle read different data than written @@ -753,7 +857,7 @@ the appropriate -m if posix test or not). The stderr output file is also created (based on the filename that is autogenerated) and the appropriate option is set for the MPI job run command. -Args that can be passed in are ([-pncbx][-A|-M|-N|-P|-S|-V]). All other args +Args that can be passed in are ([-cblnpx][-A|-L|-M|-N|-P|-S|-V]). All other args (see :ref:`Running the Examples `) are set automatically, including the outfile and filename (which are generated based on the input ``$app_name`` and ``$app_args``). diff --git a/t/ci/001-setup.sh b/t/ci/001-setup.sh index f445a0033..74f7ddfaa 100755 --- a/t/ci/001-setup.sh +++ b/t/ci/001-setup.sh @@ -1,54 +1,34 @@ #!/bin/sh -# This script checks for an installation of UnifyFS (either with Spack or in -# $HOME/UnifyFS/install) and then sets up variables needed for testing. -# -# All of this is done in this script so that tests can be run individually if -# desired. To run all tests simply run the RUN_TESTS.sh script. If Individual -# tests are desired to be run, source the 001-setup.sh script first, followed by -# 002-start-server.sh. Then source each desired script after that preceded by -# `$UNIFYFS_CI_DIR`. When finished, source the 990-stop-server.sh script last. -# -# E.g.: -# $ . full/path/to/001-setup.sh -# $ . $UNIFYFS_CI_DIR/002-start-server.sh -# $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh -# $ . $UNIFYFS_CI_DIR/990-stop-server.sh -# -# To run all of the tests, simply run RUN_CI_TESTS.sh -# -# E.g.: -# $ ./RUN_CI_TESTS.sh -# or -# $ prove -v RUN_CI_TESTS.sh -# -# Before doing either of these, make sure you have interactively allocated nodes -# or are submitting a batch job. 
+# This script checks for an installation of UnifyFS (module/spack loaded, in the +# UNIFYFS_INSTALL envar, or in the parent directory of this source code) and +# then sets up variables needed for testing. test_description="Set up UnifyFS testing environment" SETUP_USAGE="$(cat < ${UNIFYFS_LOG_DIR}/unifyfs.start.out else - $UNIFYFS_CLU start -c -d -S $UNIFYFS_SHAREDFS_DIR \ + $UNIFYFS_CLU start -d -S $UNIFYFS_SHAREDFS_DIR \ &> ${UNIFYFS_LOG_DIR}/unifyfs.start.out fi diff --git a/t/ci/100-writeread-tests.sh b/t/ci/100-writeread-tests.sh index 753950e64..b8c83360c 100755 --- a/t/ci/100-writeread-tests.sh +++ b/t/ci/100-writeread-tests.sh @@ -40,6 +40,7 @@ usage ./100-writeread-tests.sh [options] options: -h, --help print this (along with overall) help message + -l, --laminate laminate between writing and reading -M, --mpiio use MPI-IO instead of POSIX I/O -x, --shuffle read different data than written @@ -51,6 +52,8 @@ was built with (static, gotcha, and optionally posix). Providing available options can change the default I/O behavior and/or I/O type used. The varying I/O types are mutually exclusive options and thus only one should be provided at a time. + +For more information on manually running tests, run './001-setup.sh -h'. EOF )" @@ -60,7 +63,6 @@ do -h|--help) echo "$WRITEREAD_USAGE" ci_dir=$(dirname "$(readlink -fm $BASH_SOURCE)") - $ci_dir/001-setup.sh -h exit ;; -l|--laminate) diff --git a/t/ci/110-write-tests.sh b/t/ci/110-write-tests.sh index da946d0eb..72681e5ab 100755 --- a/t/ci/110-write-tests.sh +++ b/t/ci/110-write-tests.sh @@ -42,6 +42,7 @@ usage ./110-write-tests.sh [options] options: -h, --help print this (along with overall) help message + -l, --laminate laminate after finished writing -M, --mpiio use MPI-IO instead of POSIX I/O Run a series of tests on the UnifyFS write example application. By default, a @@ -52,6 +53,8 @@ was built with (static, gotcha, and optionally posix). Providing available options can change the default I/O behavior and/or I/O type used. The varying I/O types are mutually exclusive options and thus only one should be provided at a time. + +For more information on manually running tests, run './001-setup.sh -h'. EOF )" @@ -61,7 +64,6 @@ do -h|--help) echo "$WRITE_USAGE" ci_dir=$(dirname "$(readlink -fm $BASH_SOURCE)") - $ci_dir/001-setup.sh -h exit ;; -l|--laminate) diff --git a/t/ci/120-read-tests.sh b/t/ci/120-read-tests.sh index b8aae08de..bd19cd66a 100755 --- a/t/ci/120-read-tests.sh +++ b/t/ci/120-read-tests.sh @@ -42,6 +42,7 @@ usage ./120-read-tests.sh [options] options: -h, --help print this (along with overall) help message + -l, --laminate read the corresponding laminated file from write -M, --mpiio use MPI-IO instead of POSIX I/O The write tests (110-write-tests.sh) need to be run first with the same options @@ -55,6 +56,8 @@ was built with (static, gotcha, and optionally posix). Providing available options can change the default I/O behavior and/or I/O type used. The varying I/O types are mutually exclusive options and thus only one should be provided at a time. + +For more information on manually running tests, run './001-setup.sh -h'. 
EOF )" @@ -64,7 +67,6 @@ do -h|--help) echo "$READ_USAGE" ci_dir=$(dirname "$(readlink -fm $BASH_SOURCE)") - $ci_dir/001-setup.sh -h exit ;; -l|--laminate) diff --git a/t/ci/300-producer-consumer-tests.sh b/t/ci/300-producer-consumer-tests.sh index a58325edf..bae4579de 100755 --- a/t/ci/300-producer-consumer-tests.sh +++ b/t/ci/300-producer-consumer-tests.sh @@ -43,6 +43,7 @@ usage ./300-producer-consumer-tests.sh [options] options: -h, --help print this (along with overall) help message + -l, --laminate laminate files between writing and reading -M, --mpiio use MPI-IO instead of POSIX I/O Run a series of producer-consumer workload tests using the UnifyFS write and @@ -57,6 +58,8 @@ should be provided at a time. Only run this suite when using two or more hosts. Use 110-write-tests.sh followed by 120-read-tests.sh when using a single host as it will be equivalent in this case. + +For more information on manually running tests, run './001-setup.sh -h'. EOF )" @@ -66,7 +69,6 @@ do -h|--help) echo "$PRODUCER_CONSUMER_USAGE" ci_dir=$(dirname "$(readlink -fm $BASH_SOURCE)") - $ci_dir/001-setup.sh -h exit ;; -l|--laminate) diff --git a/t/ci/RUN_CI_TESTS.sh b/t/ci/RUN_CI_TESTS.sh index 477e92080..7a8a37113 100755 --- a/t/ci/RUN_CI_TESTS.sh +++ b/t/ci/RUN_CI_TESTS.sh @@ -1,79 +1,190 @@ #!/bin/sh -# This script is to run the entire integration test suite of TAP tests for -# Unify. -# In order to run individual tests, run `./001-setup.sh -h`. -# -# To run all of the tests, simply run RUN_CI_TESTS.sh. -# -# E.g.: -# $ ./RUN_CI_TESTS.sh -# or -# $ prove -v RUN_CI_TESTS.sh -# -# If individual tests are desired to be run, source the 001-setup.sh script -# first, followed by 002-start-server.sh. Then source each desired script after -# that preceded by `$UNIFYFS_CI_DIR`. When finished, source the -# 990-stop-server.sh script last. -# -# E.g.: -# $ . full/path/to/001-setup.sh -# $ . $UNIFYFS_CI_DIR/002-start-server.sh -# $ . $UNIFYFS_CI_DIR/100-writeread-tests.sh -# $ . $UNIFYFS_CI_DIR/990-stop-server.sh -# -# Before doing either of these, make sure you have interactively allocated nodes -# or are submitting a batch job. -# -# If additional tests are desired, create a script after the fashion of -# 100-writeread-tests.sh where the prefixed number indicates the desired order -# for running the tests. Then source that script in this script below, in the -# desired order. - test_description="Unify Integration Testing Suite" RUN_CI_TESTS_USAGE="$(cat < /dev/null +if [ $? != 4 ]; then + echo "ERROR: `getopt --test` failed in this environment" + exit 1 +fi + +# At least one parameter is required. Error out if none detected +if [ "$#" -lt "1" ]; then + echo "ERROR: No input parameters detected. At least one parameter required" + echo "Run './RUN_CI_TESTS.sh -h' for help" + exit 1 +fi + +# Set up short and long options +SHORTOPTS=hs:t: +LONGOPTS=help,suite:,type: +TEMP=$(getopt -o $SHORTOPTS -l $LONGOPTS --name "$0" -- "$@") +if [ $? 
!= 0 ]; then + # getopt has complained about wrong arguments to stdout + echo "Run './RUN_CI_TESTS.sh -h' for help" + exit 1 +fi + +eval set -- "$TEMP" + +# Suite and type envars for determining which tests to run, initialized to false +suite_envars=(WRITEREAD WRITE READ PC STAGE) +for i in "${suite_envars[@]}"; do export ${i}=false; done +type_envars=(POSIX MPIIO) +for j in "${type_envars[@]}"; do export ${j}=false; done + +SUITE= +while true; do + case "$1" in + -h|--help ) echo "$RUN_CI_TESTS_USAGE"; exit ;; + -s|--suite ) + SUITE="set" + suites=(${2//,/ }) + shift 2 + ;; + -t|--type ) + [ "$SUITE" != "set" ] && + { echo "USAGE ERROR: -t|--type requested but no -s|--suite selected" + echo "Run './RUN_CI_TESTS.sh -h' for help" exit 1 - ;; - esac + } || types=(${2//,/ }) + shift 2 + ;; + -- ) shift; break ;; + * ) echo "Unknown option error"; exit 2 ;; + esac done +# Verify desired suite(s)/type(s) are valid and set flags for which tests to run +if [[ ${#suites[@]} -eq 1 && "${suites[0]}" == "stage" ]]; then + # Stage was only suite selected so type doesn't matter + export STAGE=true +else + # All suites (except stage) require a type to be specified + if [[ ${#types[@]} -eq 0 ]]; then + echo "ERROR: Requested suite(s) requires a -t|--type to be designated" + echo "Run './RUN_CI_TESTS.sh -h' for help" + exit 1 + fi + + # If "all" then other values are redundant. If read then write is redundant + for s in "${suites[@]}"; do + # Ensure suite is a valid value + if [[ ! "$s" =~ ^(all|writeread|write|read|pc|stage)$ ]]; then + echo "Error: suite arg ($s) not recognized" + echo "Run './RUN_CI_TESTS.sh -h' for help" + exit 1 + elif [ "$s" == "all" ]; then # Flag all suites to be run + for i in "${suite_envars[@]}"; do export ${i^^}=true; done + else # Flag this suite to be run + export ${s^^}=true + fi + done + + # If "all" then other values are redundant + for t in "${types[@]}"; do + # Ensure type is a valid value + if [[ ! "$t" =~ ^(all|posix|mpiio)$ ]]; then + echo "Error: type arg ($t) not recognized" + echo "Run './RUN_CI_TESTS.sh -h' for help" + exit 1 + elif [ "$t" == "all" ]; then # Flag all types to be run + for j in "${type_envars[@]}"; do export ${j^^}=true; done + else # Flag this type to be run + export ${t^^}=true + fi + done +fi + + +### Start running tests ### + SECONDS=0 start_time=$SECONDS echo "Started RUN_CI_TESTS.sh @: $(date)" @@ -81,8 +192,8 @@ echo "Started RUN_CI_TESTS.sh @: $(date)" # Set up UNIFYFS_CI_DIR if this script is called first UNIFYFS_CI_DIR=${UNIFYFS_CI_DIR:-"$(dirname "$(readlink -fm $BASH_SOURCE)")"} -# test_done gets called in 990-stop-server.sh if this is not set. # If not set, tests can be run individually +# test_done gets called in 990-stop-server.sh if this is not set. UNIFYFS_CI_TESTS_FULL_RUN=true # setup testing @@ -96,79 +207,129 @@ setup_time=$SECONDS echo "Setup time -- $(elapsed_time start_time setup_time)" ############################################################################## -# Add additional testing files between here and the final testing time (before -# 990-stop-server.sh) in the desired order to run them. +# If additional tests are desired, create a script after the fashion of +# 100-writeread-tests.sh where the prefixed number indicates the desired order +# for running the tests. Then source that script between here and the final +# testing time (before 990-stop-server.sh) in the desired order to run them. 
############################################################################## -### POSIX-IO tests ### - -# POSIX-IO writeread example tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate - -# POSIX-IO writeread example with I/O shuffle tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle - -# POSIX-IO writeread example w/out laminate tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh - -# POSIX-IO writeread example w/out laminate tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle - -# POSIX-IO write example tests -source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate - -# POSIX-IO read example tests -source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate - -# POSIX-IO write example w/out laminate tests -source $UNIFYFS_CI_DIR/110-write-tests.sh - -# POSIX-IO read example w/out laminate tests -source $UNIFYFS_CI_DIR/120-read-tests.sh - -### MPI-IO tests ### - -# MPI-IO writeread example tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --mpiio - -# MPI-IO writeread example with I/O shuffle tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle --mpiio - -# MPI-IO writeread example w/out laminate tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --mpiio - -# MPI-IO writeread example w/out laminate tests -source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle --mpiio - -# MPI-IO write example tests -source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate --mpiio - -# MPI-IO read example tests -source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate --mpiio - -# MPI-IO write example w/out laminate tests -source $UNIFYFS_CI_DIR/110-write-tests.sh --mpiio - -# MPI-IO read example w/out laminate tests -source $UNIFYFS_CI_DIR/120-read-tests.sh --mpiio - -### Producer-Consumer workload tests ### - -# POSIX-IO producer-consumer tests -source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate - -# POSIX-IO producer-consumer w/out laminate tests -source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh - -# MPI-IO producer-consumer tests -source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate --mpiio - -# MPI-IO producer-consumer w/out laminate tests -source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --mpiio - -### unifyfs-stage tests ### - -source $UNIFYFS_CI_DIR/800-stage-tests.sh +##### WRITEREAD test suite ##### + +if [ "$WRITEREAD" == "true" ]; then + ### WRITEREAD-POSIX test suite ### + if [ "$POSIX" == true ]; then + echo "Running WRITEREAD-POSIX test suite" + # POSIX-IO writeread example tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate + + # POSIX-IO writeread example with I/O shuffle tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle + + # POSIX-IO writeread example w/out laminate tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh + + # POSIX-IO writeread example w/out laminate tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle + echo "Finished WRITEREAD-POSIX test suite" + fi + + ### WRITEREAD-MPIIO test suite ### + if [ "$MPIIO" == "true" ]; then + echo "Running WRITEREAD-MPIIO test suite" + # MPI-IO writeread example tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --mpiio + + # MPI-IO writeread example with I/O shuffle tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --laminate --shuffle --mpiio + + # MPI-IO writeread example w/out laminate tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --mpiio + + # MPI-IO writeread example w/out laminate tests + source $UNIFYFS_CI_DIR/100-writeread-tests.sh --shuffle --mpiio + echo 
"Finished WRITEREAD-MPIIO test suite" + fi +fi + +##### WRITE/READ test suite ##### +# Producer-consumer tests on all hosts + +if [ "$WRITE" == "true" ] || [ "$READ" == "true" ]; then + if [ "$POSIX" == true ]; then + echo "Running WRITE/READ-POSIX test suite" + # posix-io write example tests + source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate + + if [ "$READ" == "true" ]; then + # posix-io read example tests + source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate + fi + + # posix-io write example w/out laminate tests + source $UNIFYFS_CI_DIR/110-write-tests.sh + + if [ "$READ" == "true" ]; then + # posix-io read example w/out laminate tests + source $UNIFYFS_CI_DIR/120-read-tests.sh + fi + echo "Finished WRITE/READ-POSIX test suite" + fi + + if [ "$MPIIO" == "true" ]; then + echo "Running WRITE/READ-MPIIO test suite" + # MPI-IO write example tests + source $UNIFYFS_CI_DIR/110-write-tests.sh --laminate --mpiio + + if [ "$READ" == "true" ]; then + # MPI-IO read example tests + source $UNIFYFS_CI_DIR/120-read-tests.sh --laminate --mpiio + fi + + # MPI-IO write example w/out laminate tests + source $UNIFYFS_CI_DIR/110-write-tests.sh --mpiio + + if [ "$READ" == "true" ]; then + # MPI-IO read example w/out laminate tests + source $UNIFYFS_CI_DIR/120-read-tests.sh --mpiio + fi + echo "Finished WRITE/READ-MPIIO test suite" + fi +fi + +##### Producer-Consumer workload test suite ##### +# Producer-consumer tests disjoint sets of hosts + +if [ "$PC" == "true" ]; then + ### PC-POSIX test suite ### + if [ "$POSIX" == true ]; then + echo "Running PC-POSIX test suite" + # POSIX-IO producer-consumer tests + source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate + + # POSIX-IO producer-consumer w/out laminate tests + source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh + echo "Finished PC-POSIX test suite" + fi + + ### PC-MPIIO test suite ### + if [ "$MPIIO" == "true" ]; then + echo "Running PC-MPIIO test suite" + # MPI-IO producer-consumer tests + source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --laminate --mpiio + + # MPI-IO producer-consumer w/out laminate tests + source $UNIFYFS_CI_DIR/300-producer-consumer-tests.sh --mpiio + echo "Finished PC-MPIIO test suite" + fi +fi + +##### unifyfs-stage test suite ##### + +if [ "$STAGE" == "true" ]; then + echo "Running STAGE test suite" + source $UNIFYFS_CI_DIR/800-stage-tests.sh + echo "Finished STAGE test suite" +fi ############################################################################## # DO NOT add additional tests after this point diff --git a/t/ci/setup-lsf.sh b/t/ci/setup-lsf.sh index b8f0633d6..041a6c60c 100755 --- a/t/ci/setup-lsf.sh +++ b/t/ci/setup-lsf.sh @@ -71,7 +71,7 @@ jsargs="-a ${nprocs} -c ${ncores} -r ${nrs_per_node}" app_out="-o" app_err="-k" JOB_RUN_COMMAND="jsrun $jsargs -n ${nres_sets}" -JOB_RUN_ONCE_PER_NODE="jsrun -r1" +JOB_RUN_ONCE_PER_NODE="jsrun -r 1" JOB_ID=${JOB_ID:-$LSB_JOBID} # Set up producer-consumer variables and functions when using two or more hosts @@ -105,5 +105,5 @@ fi echo "$infomsg ====================== LSF Job Info ======================" echo "$infomsg ----------------------- Job Status -----------------------" -bjobs -l $JOB_ID +bquery -l $JOB_ID echo "$infomsg ----------------------------------------------------------" diff --git a/t/ci/setup-slurm.sh b/t/ci/setup-slurm.sh index da40d080a..4ecbd565e 100755 --- a/t/ci/setup-slurm.sh +++ b/t/ci/setup-slurm.sh @@ -63,5 +63,5 @@ fi echo "$infomsg ====================== SLURM Job Info ======================" echo "$infomsg 
------------------------ Job Status ------------------------" -checkjob -v $JOB_ID +scontrol show job $JOB_ID echo "$infomsg ------------------------------------------------------------" From 587f200876d5528c4cb3aa8187e680c304d17f88 Mon Sep 17 00:00:00 2001 From: CamStan Date: Wed, 17 Aug 2022 15:23:06 -0700 Subject: [PATCH 80/81] Remove consistency model config; 1.0 doc updates Remove consistency model configuration code as this is an old placeholder and UnifyFS will likely never claim POSIX compliance. Update docs to prepare to 1.0 release. - Update docs metadata - Add limitation paragraph on directory support - Remove mentions of the consistency model config option - Minor adjustments for readability --- common/src/Makefile.mk | 2 - common/src/cm_enumerator.c | 93 ------------------------------- common/src/cm_enumerator.h | 76 ------------------------- common/src/unifyfs_configurator.h | 1 - docs/api.rst | 16 +++--- docs/conf.py | 6 +- docs/configuration.rst | 3 - docs/limitations.rst | 4 ++ docs/run.rst | 5 +- util/unifyfs/src/unifyfs-rm.c | 9 --- util/unifyfs/src/unifyfs.c | 13 ----- util/unifyfs/src/unifyfs.h | 2 - 12 files changed, 17 insertions(+), 213 deletions(-) delete mode 100644 common/src/cm_enumerator.c delete mode 100644 common/src/cm_enumerator.h diff --git a/common/src/Makefile.mk b/common/src/Makefile.mk index 7842bafac..d7ce671af 100644 --- a/common/src/Makefile.mk +++ b/common/src/Makefile.mk @@ -6,8 +6,6 @@ UNIFYFS_COMMON_INSTALL_HDRS = \ UNIFYFS_COMMON_BASE_SRCS = \ %reldir%/arraylist.h \ %reldir%/arraylist.c \ - %reldir%/cm_enumerator.h \ - %reldir%/cm_enumerator.c \ %reldir%/ini.h \ %reldir%/ini.c \ %reldir%/rm_enumerator.h \ diff --git a/common/src/cm_enumerator.c b/common/src/cm_enumerator.c deleted file mode 100644 index 6cb9c9f3c..000000000 --- a/common/src/cm_enumerator.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* Copyright (c) 2018 - Michael J. 
Brim - * - * Enumerator is part of https://github.com/MichaelBrim/tedium - * - * MIT License - See LICENSE.tedium - */ - -#include "cm_enumerator.h" -#include -#include -#include - -/* c-strings for enum names */ - -#define ENUMITEM(name, desc) \ - const char *UNIFYFS_CM_ENUM_ ## name ## _NAME_STR = #name; -UNIFYFS_CM_ENUMERATOR -#undef ENUMITEM - -const char *unifyfs_cm_enum_str(unifyfs_cm_e e) -{ - switch (e) { - case UNIFYFS_CM_INVALID: - return "UNIFYFS_CM_INVALID"; -#define ENUMITEM(name, desc) \ - case UNIFYFS_CM_ ## name: \ - return UNIFYFS_CM_ENUM_ ## name ## _NAME_STR; - UNIFYFS_CM_ENUMERATOR -#undef ENUMITEM - default : - break; - } - return NULL; -} - -/* c-strings for enum descriptions */ - -#define ENUMITEM(name, desc) \ - const char *UNIFYFS_CM_ENUM_ ## name ## _DESC_STR = #desc; -UNIFYFS_CM_ENUMERATOR -#undef ENUMITEM - -const char *unifyfs_cm_enum_description(unifyfs_cm_e e) -{ - switch (e) { - case UNIFYFS_CM_INVALID: - return "invalid unifyfs_cm_e value"; -#define ENUMITEM(name, desc) \ - case UNIFYFS_CM_ ## name: \ - return UNIFYFS_CM_ENUM_ ## name ## _DESC_STR; - UNIFYFS_CM_ENUMERATOR -#undef ENUMITEM - default : - break; - } - return NULL; -} - -unifyfs_cm_e unifyfs_cm_enum_from_str(const char *s) -{ - if (0) - ; -#define ENUMITEM(name, desc) \ - else if (strcmp(s, #name) == 0) \ - return UNIFYFS_CM_ ## name; - UNIFYFS_CM_ENUMERATOR; -#undef ENUMITEM - - return UNIFYFS_CM_INVALID; -} - -/* validity check */ - -int check_valid_unifyfs_cm_enum(unifyfs_cm_e e) -{ - return ((e > UNIFYFS_CM_INVALID) && - (e < UNIFYFS_CM_ENUM_MAX) && - (unifyfs_cm_enum_str(e) != NULL)); -} diff --git a/common/src/cm_enumerator.h b/common/src/cm_enumerator.h deleted file mode 100644 index 2f1744cfa..000000000 --- a/common/src/cm_enumerator.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2020, Lawrence Livermore National Security, LLC. - * Produced at the Lawrence Livermore National Laboratory. - * - * Copyright 2020, UT-Battelle, LLC. - * - * LLNL-CODE-741539 - * All rights reserved. - * - * This is the license for UnifyFS. - * For details, see https://github.com/LLNL/UnifyFS. - * Please read https://github.com/LLNL/UnifyFS/LICENSE for full license text. - */ - -/* Copyright (c) 2018 - Michael J. 
Brim - * - * Enumerator is part of https://github.com/MichaelBrim/tedium - * - * MIT License - See LICENSE.tedium - */ - -#ifndef _UNIFYFS_CM_ENUMERATOR_H_ -#define _UNIFYFS_CM_ENUMERATOR_H_ - -/** - * @brief enumerator list expanded many times with varied ENUMITEM() definitions - * - * @param item name - * @param item short description - */ -#define UNIFYFS_CM_ENUMERATOR \ - ENUMITEM(NONE, "no consistency") \ - ENUMITEM(LAMINATED, "UnifyFS laminated consistency model") \ - ENUMITEM(POSIX, "POSIX I/O consistency model") - -#ifdef __cplusplus -extern "C" { -#endif - -/** - * @brief supported consistency models - */ -typedef enum { - UNIFYFS_CM_INVALID = 0, -#define ENUMITEM(name, desc) \ - UNIFYFS_CM_ ## name, - UNIFYFS_CM_ENUMERATOR -#undef ENUMITEM - UNIFYFS_CM_ENUM_MAX -} unifyfs_cm_e; - -/** - * @brief get C-string for given consistency model enum value - */ -const char *unifyfs_cm_enum_str(unifyfs_cm_e e); - -/** - * @brief get description for given consistency model enum value - */ -const char *unifyfs_cm_enum_description(unifyfs_cm_e e); - -/** - * @brief check validity of given consistency model enum value - */ -int check_valid_unifyfs_cm_enum(unifyfs_cm_e e); - -/** - * @brief get enum value for given consistency model C-string - */ -unifyfs_cm_e unifyfs_cm_enum_from_str(const char *s); - -#ifdef __cplusplus -} /* extern C */ -#endif - -#endif /* UNIFYFS_CM_ENUMERATOR_H */ diff --git a/common/src/unifyfs_configurator.h b/common/src/unifyfs_configurator.h index 8c1f147b2..34870f56d 100644 --- a/common/src/unifyfs_configurator.h +++ b/common/src/unifyfs_configurator.h @@ -66,7 +66,6 @@ #define UNIFYFS_CONFIGS \ UNIFYFS_CFG_CLI(unifyfs, cleanup, BOOL, off, "cleanup storage on server exit", NULL, 'C', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, configfile, STRING, /etc/unifyfs.conf, "path to configuration file", configurator_file_check, 'f', "specify full path to config file") \ - UNIFYFS_CFG_CLI(unifyfs, consistency, STRING, LAMINATED, "consistency model", NULL, 'c', "specify consistency model (NONE | LAMINATED | POSIX)") \ UNIFYFS_CFG_CLI(unifyfs, daemonize, BOOL, off, "enable server daemonization", NULL, 'D', "on|off") \ UNIFYFS_CFG_CLI(unifyfs, mountpoint, STRING, /unifyfs, "mountpoint directory", NULL, 'm', "specify full path to desired mountpoint") \ UNIFYFS_CFG(client, cwd, STRING, NULLSTRING, "current working directory", NULL) \ diff --git a/docs/api.rst b/docs/api.rst index 79d593f63..d49846343 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -4,6 +4,14 @@ Integrate the UnifyFS API This section describes how to use the UnifyFS API in an application. +.. rubric:: Transparent Mount Caveat + +MPI applications that take advantage of the :ref:`transparent mounting +` feature (through configuring with ``--enable-mpi-mount`` or +with ``+auto-mount`` through Spack) do not need to be modified in any way in +order to use UnifyFS. Move on to the :doc:`link` section next as this step can +be skipped. + .. Attention:: **Fortran Compatibility** ``unifyfs_mount`` and ``unifyfs_unmount`` are usable with GFortran. @@ -15,14 +23,6 @@ This section describes how to use the UnifyFS API in an application. include the ``+fortran`` variant, or configure UnifyFS with the ``--enable-fortran`` option if building manually. -.. rubric:: Transparent Mount Caveat - -MPI applications that take advantage of the :ref:`transparent mounting -` feature (through configuring with ``--enable-mpi-mount`` or -with ``+auto-mount`` through Spack) do not need to be modified in any way in -order to use UnifyFS. 
Move on to the :doc:`link` section next as this step can -be skipped. - ----- -------------------------- diff --git a/docs/conf.py b/docs/conf.py index 1f0ac6a60..c6ab1b220 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -47,16 +47,16 @@ # General information about the project. project = u'UnifyFS' copyright = u'2020, Lawrence Livermore National Security LLC, LLNL-CODE-741539, UT-Batelle LLC' -author = u'Kathryn Mohror, Adam Moody, Oral Sarp, Feiyi Wang, Hyogi Sim, Swen Boehm, Michael Brim, Danielle Sikich, Joseph Moore, Ned Bass, Tony Hutter, Celso Mendes, Craig Steffen, Cameron Stanavige' +author = u'Kathryn Mohror, Adam Moody, Sarp Oral, Feiyi Wang, Hyogi Sim, Swen Boehm, Michael Brim, Jenna DeLozier, Ross Miller, Danielle Sikich, Joseph Moore, Ned Bass, Tony Hutter, Hariharan Devarajan, Cameron Stanavige, Celso Mendes, Craig Steffen' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = u'0.9.1' +version = u'1.0' # The full version, including alpha/beta/rc tags. -release = u'0.9.1' +release = u'1.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/configuration.rst b/docs/configuration.rst index a61f11c28..7e18c08db 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -55,7 +55,6 @@ a given section and key. ============= ====== =============================================== cleanup BOOL cleanup storage on server exit (default: off) configfile STRING path to custom configuration file - consistency STRING consistency model [ LAMINATED | POSIX | NONE ] daemonize BOOL enable server daemonization (default: off) mountpoint STRING mountpoint path prefix (default: /unifyfs) ============= ====== =============================================== @@ -186,7 +185,6 @@ is used, the value must immediately follow the option character (e.g., ``-Cyes`` ========================= ======== ``--unifyfs-cleanup`` ``-C`` ``--unifyfs-configfile`` ``-f`` - ``--unifyfs-consistency`` ``-c`` ``--unifyfs-daemonize`` ``-D`` ``--unifyfs-mountpoint`` ``-m`` ``--log-verbosity`` ``-v`` @@ -197,4 +195,3 @@ is used, the value must immediately follow the option character (e.g., ``-Cyes`` ``--sharedfs-dir`` ``-S`` ``--server-init_timeout`` ``-t`` ========================= ======== - diff --git a/docs/limitations.rst b/docs/limitations.rst index dfde0c5c3..f83523ac7 100644 --- a/docs/limitations.rst +++ b/docs/limitations.rst @@ -29,6 +29,10 @@ some less obvious limitations when using some I/O libraries with UnifyFS corruption. Running the application through :doc:`VerifyIO ` can help determine if any file locking calls are made. +.. rubric:: Directories + +UnifyFS does not currently support directory operations. + ---------- --------------------------- diff --git a/docs/run.rst b/docs/run.rst index 7dcba228f..1677d5540 100644 --- a/docs/run.rst +++ b/docs/run.rst @@ -52,8 +52,8 @@ for further details on customizing the UnifyFS runtime configuration. .. _unifyfs_utility_label: -``unifyfs`` provides command-line options to choose the client mountpoint, -adjust the consistency model, and control stage-in and stage-out of files. +``unifyfs`` provides command-line options to select the shared file system path, +choose the client mountpoint, and control stage-in and stage-out of files. The full usage for ``unifyfs`` is as follows: .. 
code-block:: Bash @@ -71,7 +71,6 @@ The full usage for ``unifyfs`` is as follows: -h, --help print usage Command options for "start": - -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX) -e, --exe= [OPTIONAL] where unifyfsd is installed -m, --mount= [OPTIONAL] mount UnifyFS at -s, --script= [OPTIONAL] to custom launch script diff --git a/util/unifyfs/src/unifyfs-rm.c b/util/unifyfs/src/unifyfs-rm.c index 76fbab847..d9f2ed18d 100644 --- a/util/unifyfs/src/unifyfs-rm.c +++ b/util/unifyfs/src/unifyfs-rm.c @@ -664,15 +664,6 @@ static size_t construct_server_argv(unifyfs_args_t* args, argc++; } - if (args->consistency != UNIFYFS_CM_LAMINATED) { - if (server_argv != NULL) { - server_argv[argc] = strdup("-c"); - server_argv[argc + 1] = - strdup(unifyfs_cm_enum_str(args->consistency)); - } - argc += 2; - } - if (args->mountpoint != NULL) { if (server_argv != NULL) { server_argv[argc] = strdup("-m"); diff --git a/util/unifyfs/src/unifyfs.c b/util/unifyfs/src/unifyfs.c index cdc30584b..fbb436834 100644 --- a/util/unifyfs/src/unifyfs.c +++ b/util/unifyfs/src/unifyfs.c @@ -67,7 +67,6 @@ static unifyfs_resource_t resource; static struct option const long_opts[] = { { "cleanup", no_argument, NULL, 'c' }, - { "consistency", required_argument, NULL, 'C' }, { "debug", no_argument, NULL, 'd' }, { "exe", required_argument, NULL, 'e' }, { "help", no_argument, NULL, 'h' }, @@ -97,7 +96,6 @@ static char* usage_str = " -h, --help print usage\n" "\n" "Command options for \"start\":\n" - " -C, --consistency= [OPTIONAL] consistency model (NONE | LAMINATED | POSIX)\n" " -e, --exe= [OPTIONAL] where unifyfsd is installed\n" " -m, --mount= [OPTIONAL] mount UnifyFS at \n" " -s, --script= [OPTIONAL] to custom launch script\n" @@ -132,7 +130,6 @@ static void parse_cmd_arguments(int argc, char** argv) int timeout = UNIFYFS_DEFAULT_INIT_TIMEOUT; int stage_parallel = 0; int stage_timeout = -1; - unifyfs_cm_e consistency = UNIFYFS_CM_LAMINATED; char* mountpoint = NULL; char* script = NULL; char* share_dir = NULL; @@ -150,13 +147,6 @@ static void parse_cmd_arguments(int argc, char** argv) cleanup = 1; break; - case 'C': - consistency = unifyfs_cm_enum_from_str(optarg); - if (consistency == UNIFYFS_CM_INVALID) { - usage(1); - } - break; - case 'd': debug = 5; break; @@ -211,7 +201,6 @@ static void parse_cmd_arguments(int argc, char** argv) cli_args.debug = debug; cli_args.cleanup = cleanup; - cli_args.consistency = consistency; cli_args.script = script; cli_args.mountpoint = mountpoint; cli_args.server_path = srvr_exe; @@ -254,8 +243,6 @@ int main(int argc, char** argv) if (debug) { printf("\n## options from the command line ##\n"); printf("cleanup:\t%d\n", cli_args.cleanup); - printf("consistency:\t%s\n", - unifyfs_cm_enum_str(cli_args.consistency)); printf("debug:\t%d\n", cli_args.debug); printf("mountpoint:\t%s\n", cli_args.mountpoint); printf("script:\t%s\n", cli_args.script); diff --git a/util/unifyfs/src/unifyfs.h b/util/unifyfs/src/unifyfs.h index f986ce5e0..43d0495c6 100644 --- a/util/unifyfs/src/unifyfs.h +++ b/util/unifyfs/src/unifyfs.h @@ -46,7 +46,6 @@ #include #include -#include "cm_enumerator.h" #include "rm_enumerator.h" #include "unifyfs_const.h" @@ -58,7 +57,6 @@ struct _unifyfs_args { int debug; /* enable debug output */ int cleanup; /* cleanup on termination? 
(0 or 1) */
     int timeout;             /* timeout of server initialization */
-    unifyfs_cm_e consistency;   /* consistency model */
     char* mountpoint;        /* mountpoint */
     char* server_path;       /* full path to installed unifyfsd */
     char* share_dir;         /* full path to shared file system directory */

From 2daa92f838a54df8e3b33ae5e1dedfe425fa4b2e Mon Sep 17 00:00:00 2001
From: CamStan
Date: Thu, 25 Aug 2022 13:19:29 -0700
Subject: [PATCH 81/81] Run tests on main branch

---
 .github/workflows/build-and-test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index 277f6fa4f..884e5536a 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -2,7 +2,7 @@ name: UnifyFS Build and Test
 on:
   pull_request:
-    branches: [ dev ]
+    branches: [ main, dev ]
   push:

 jobs:
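
For reference, the -s|--suite and -t|--type handling added to t/ci/RUN_CI_TESTS.sh
in the patches above accepts comma-separated values, requires a -t|--type for every
suite except "stage", and errors out when no arguments are given. The invocations
below are a sketch inferred from that option-parsing logic, not authoritative usage
text; consult './RUN_CI_TESTS.sh -h' for the actual usage message.

    $ ./RUN_CI_TESTS.sh -s all -t all              # every suite, POSIX and MPI-IO
    $ prove -v RUN_CI_TESTS.sh :: -s all -t all    # same, run under prove
    $ ./RUN_CI_TESTS.sh -s writeread,pc -t mpiio   # writeread + producer-consumer, MPI-IO only
    $ ./RUN_CI_TESTS.sh -s stage                   # stage suite alone needs no -t|--type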