From 1518ea8e3083fda41dd175ea4b18385e813656d8 Mon Sep 17 00:00:00 2001
From: Ashley Pittman
Date: Thu, 26 Sep 2024 14:59:30 +0100
Subject: [PATCH 01/26] DAOS-15682 dfuse: Perform reads in larger chunks. (#14212)

When dfuse sees I/O arriving as well-aligned 128k reads then read 1MB at a time
and cache the result, allowing faster read bandwidth for well-behaved
applications and large files.

Create a new in-memory descriptor for file contents, pull in a whole descriptor
on first read and perform all other reads from the same result. This should
give much higher bandwidth for well-behaved applications and should be easy to
extend to proper readahead in future.

Signed-off-by: Ashley Pittman
---
 src/client/dfuse/dfuse.h | 10 ++
 src/client/dfuse/ops/open.c | 7 +-
 src/client/dfuse/ops/read.c | 339 +++++++++++++++++++++++++++++++++++-
 3 files changed, 348 insertions(+), 8 deletions(-)

diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index 58847287391..e3b3c0d7d0e 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -405,6 +405,7 @@ struct dfuse_event {
 union {
 struct dfuse_obj_hdl *de_oh;
 struct dfuse_inode_entry *de_ie;
+ struct read_chunk_data *de_cd;
 };
 off_t de_req_position; /**< The file position requested by fuse */
 union {
@@ -1011,6 +1012,8 @@ struct dfuse_inode_entry {
 /* Entry on the evict list */
 d_list_t ie_evict_entry;
+
+ struct read_chunk_core *ie_chunk;
 };
 
 /* Flush write-back cache writes to a inode. It does this by waiting for and then releasing an
@@ -1108,6 +1111,13 @@ dfuse_compute_inode(struct dfuse_cont *dfs,
 void
 dfuse_cache_evict_dir(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie);
 
+/* Free any read chunk data for an inode.
+ *
+ * Returns true if feature was used.
+ */
+bool
+read_chunk_close(struct dfuse_inode_entry *ie);
+
 /* Metadata caching functions. */
 
 /* Mark the cache as up-to-date from now */
diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c
index 46a49cf0767..772e92e1c13 100644
--- a/src/client/dfuse/ops/open.c
+++ b/src/client/dfuse/ops/open.c
@@ -139,6 +139,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
 struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh;
 struct dfuse_inode_entry *ie = NULL;
 int rc;
+ uint32_t oc;
 uint32_t il_calls;
 
 /* Perform the opposite of what the ioctl call does, always change the open handle count
@@ -206,7 +207,11 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
 if (il_calls != 0) {
 atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1);
 }
- atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1);
+ oc = atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1);
+ if (oc == 1) {
+ if (read_chunk_close(oh->doh_ie))
+ oh->doh_linear_read = true;
+ }
 
 if (oh->doh_evict_on_close) {
 ie = oh->doh_ie;
diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c
index 1edf0a959ee..43ce9328230 100644
--- a/src/client/dfuse/ops/read.c
+++ b/src/client/dfuse/ops/read.c
@@ -111,6 +111,333 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o
 return true;
 }
 
+static struct dfuse_eq *
+pick_eqt(struct dfuse_info *dfuse_info)
+{
+ uint64_t eqt_idx;
+
+ eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1);
+ return &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count];
+}
+
+/* Chunk read and coalescing
+ *
+ * This code attempts to predict application and kernel I/O patterns and preemptively read file
+ * data ahead of when it's requested.
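+ *
+ * As an illustrative example of the intended effect: an application streaming a
+ * large file through the page cache arrives at dfuse as a series of sequential
+ * 128k read requests; the first request in each 1MB region triggers a single 1MB
+ * dfs_read() and the remaining seven requests for that region are then served
+ * from the cached buffer rather than going back over the network.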
+ *
+ * For some kernels read I/O size is limited to 128k when using the page cache or 1MB when using
+ * direct I/O. To mitigate the performance impact of this, detect when well-aligned 128k reads
+ * are received and read an entire buffer's worth, then for future requests the data should already
+ * be in cache.
+ *
+ * This code is entered when caching is enabled and reads are correctly sized/aligned and not in the
+ * last CHUNK_SIZE of a file. While a file is open the inode contains a single read_chunk_core pointer
+ * and this contains a list of read_chunk_data entries, one for each bucket. Buckets where all
+ * slots have been requested are removed from the list and closed when the last request is completed.
+ *
+ * TODO: Currently there is no code to remove partially read buckets from the list so reading
+ * one slot every chunk would leave the entire file contents in memory until close and mean long
+ * list traversal times.
+ */
+
+#define CHUNK_SIZE (1024 * 1024)
+
+struct read_chunk_data {
+ struct dfuse_event *ev;
+ fuse_req_t reqs[8];
+ struct dfuse_obj_hdl *ohs[8];
+ d_list_t list;
+ uint64_t bucket;
+ struct dfuse_eq *eqt;
+ int rc;
+ int entered;
+ ATOMIC int exited;
+ bool exiting;
+ bool complete;
+};
+
+struct read_chunk_core {
+ d_list_t entries;
+};
+
+/* Global lock for all chunk read operations. Each inode has a struct read_chunk_core * entry
+ * which is checked for NULL and set whilst holding this lock. Each read_chunk_core then has
+ * a list of read_chunk_data and again, this lock protects all lists on all inodes. This avoids
+ * the need for a per-inode lock which for many files would consume considerable memory but does
+ * mean there is potentially some lock contention. The lock however is only held for list
+ * manipulation, no dfs or kernel calls are made whilst holding the lock.
+ */
+static pthread_mutex_t rc_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static void
+chunk_free(struct read_chunk_data *cd)
+{
+ d_list_del(&cd->list);
+ d_slab_release(cd->eqt->de_read_slab, cd->ev);
+ D_FREE(cd);
+}
+
+/* Called when the last open file handle on an inode is closed. This needs to free everything which
+ * is complete and for anything that isn't flag it for deletion in the callback.
+ *
+ * Returns true if the feature was used.
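+ *
+ * For example: if slot 7 of a bucket still has its dfs_read in flight when the
+ * last handle is closed then that bucket cannot be freed here; it is marked as
+ * exiting and chunk_cb() frees it when the event completes.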
+ */
+bool
+read_chunk_close(struct dfuse_inode_entry *ie)
+{
+ struct read_chunk_data *cd, *cdn;
+ bool rcb = false;
+
+ D_MUTEX_LOCK(&rc_lock);
+ if (!ie->ie_chunk)
+ goto out;
+
+ rcb = true;
+
+ d_list_for_each_entry_safe(cd, cdn, &ie->ie_chunk->entries, list) {
+ if (cd->complete) {
+ chunk_free(cd);
+ } else {
+ cd->exiting = true;
+ }
+ }
+ D_FREE(ie->ie_chunk);
+out:
+ D_MUTEX_UNLOCK(&rc_lock);
+ return rcb;
+}
+
+static void
+chunk_cb(struct dfuse_event *ev)
+{
+ struct read_chunk_data *cd = ev->de_cd;
+ fuse_req_t req;
+ bool done = false;
+
+ cd->rc = ev->de_ev.ev_error;
+
+ if (cd->rc == 0 && (ev->de_len != CHUNK_SIZE)) {
+ cd->rc = EIO;
+ DS_WARN(cd->rc, "Unexpected short read bucket %ld (%#zx) expected %i got %zi",
+ cd->bucket, cd->bucket * CHUNK_SIZE, CHUNK_SIZE, ev->de_len);
+ }
+
+ daos_event_fini(&ev->de_ev);
+
+ do {
+ int i;
+ req = 0;
+
+ D_MUTEX_LOCK(&rc_lock);
+
+ if (cd->exiting) {
+ chunk_free(cd);
+ D_MUTEX_UNLOCK(&rc_lock);
+ return;
+ }
+
+ cd->complete = true;
+ for (i = 0; i < 8; i++) {
+ if (cd->reqs[i]) {
+ req = cd->reqs[i];
+ cd->reqs[i] = 0;
+ break;
+ }
+ }
+
+ D_MUTEX_UNLOCK(&rc_lock);
+
+ if (req) {
+ size_t position = (cd->bucket * CHUNK_SIZE) + (i * K128);
+
+ if (cd->rc != 0) {
+ DFUSE_REPLY_ERR_RAW(cd->ohs[i], req, cd->rc);
+ } else {
+ DFUSE_TRA_DEBUG(cd->ohs[i], "%#zx-%#zx read", position,
+ position + K128 - 1);
+ DFUSE_REPLY_BUFQ(cd->ohs[i], req, ev->de_iov.iov_buf + (i * K128),
+ K128);
+ }
+
+ if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7)
+ done = true;
+ }
+ } while (req && !done);
+
+ if (done) {
+ d_slab_release(cd->eqt->de_read_slab, cd->ev);
+ D_FREE(cd);
+ }
+}
+
+/* Submit a read to dfs.
+ *
+ * Returns true on success.
+ */
+static bool
+chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd, int slot)
+{
+ struct dfuse_info *dfuse_info = fuse_req_userdata(req);
+ struct dfuse_inode_entry *ie = oh->doh_ie;
+ struct dfuse_event *ev;
+ struct dfuse_eq *eqt;
+ int rc;
+ daos_off_t position = cd->bucket * CHUNK_SIZE;
+
+ eqt = pick_eqt(dfuse_info);
+
+ ev = d_slab_acquire(eqt->de_read_slab);
+ if (ev == NULL) {
+ cd->rc = ENOMEM;
+ return false;
+ }
+
+ ev->de_iov.iov_len = CHUNK_SIZE;
+ ev->de_req = req;
+ ev->de_cd = cd;
+ ev->de_sgl.sg_nr = 1;
+ ev->de_len = 0;
+ ev->de_complete_cb = chunk_cb;
+
+ cd->ev = ev;
+ cd->eqt = eqt;
+ cd->reqs[slot] = req;
+ cd->ohs[slot] = oh;
+
+ rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len,
+ &ev->de_ev);
+ if (rc != 0)
+ goto err;
+
+ /* Send a message to the async thread to wake it up and poll for events */
+ sem_post(&eqt->de_sem);
+
+ /* Now ensure there are more descriptors for the next request */
+ d_slab_restock(eqt->de_read_slab);
+
+ return true;
+
+err:
+ daos_event_fini(&ev->de_ev);
+ d_slab_release(eqt->de_read_slab, ev);
+ cd->rc = rc;
+ return false;
+}
+
+/* Try to do a bulk read.
+ *
+ * Returns true if it was able to handle the read.
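+ *
+ * As a worked example of the arithmetic below: a 128k read at position 0x340000
+ * gives last = D_ALIGNUP(0x35ffff, CHUNK_SIZE) = 0x400000, bucket =
+ * (D_ALIGNUP(0x360000, CHUNK_SIZE) / CHUNK_SIZE) - 1 = 3 and slot =
+ * (0x340000 / K128) % 8 = 2, so this request is the third 128k slot of the
+ * fourth megabyte of the file.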
+ */
+static bool
+chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh)
+{
+ struct dfuse_inode_entry *ie = oh->doh_ie;
+ struct read_chunk_core *cc;
+ struct read_chunk_data *cd;
+ off_t last;
+ uint64_t bucket;
+ int slot;
+ bool submit = false;
+ bool rcb;
+
+ if (len != K128)
+ return false;
+
+ if ((position % K128) != 0)
+ return false;
+
+ last = D_ALIGNUP(position + len - 1, CHUNK_SIZE);
+
+ if (last > oh->doh_ie->ie_stat.st_size)
+ return false;
+
+ bucket = D_ALIGNUP(position + len, CHUNK_SIZE);
+ bucket = (bucket / CHUNK_SIZE) - 1;
+
+ slot = (position / K128) % 8;
+
+ DFUSE_TRA_DEBUG(oh, "read bucket %#zx-%#zx last %#zx size %#zx bucket %ld slot %d",
+ position, position + len - 1, last, ie->ie_stat.st_size, bucket, slot);
+
+ D_MUTEX_LOCK(&rc_lock);
+ if (ie->ie_chunk == NULL) {
+ D_ALLOC_PTR(ie->ie_chunk);
+ if (ie->ie_chunk == NULL)
+ goto err;
+ D_INIT_LIST_HEAD(&ie->ie_chunk->entries);
+ }
+ cc = ie->ie_chunk;
+
+ d_list_for_each_entry(cd, &cc->entries, list)
+ if (cd->bucket == bucket) {
+ /* Remove from list to re-add again later. */
+ d_list_del(&cd->list);
+ goto found;
+ }
+
+ D_ALLOC_PTR(cd);
+ if (cd == NULL)
+ goto err;
+
+ cd->bucket = bucket;
+ submit = true;
+
+found:
+
+ if (++cd->entered < 8) {
+ /* Put on front of list for efficient searching */
+ d_list_add(&cd->list, &cc->entries);
+ }
+
+ D_MUTEX_UNLOCK(&rc_lock);
+
+ if (submit) {
+ DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot);
+ rcb = chunk_fetch(req, oh, cd, slot);
+ } else {
+ struct dfuse_event *ev = NULL;
+
+ /* Now check whether this read request is already complete. If it is not yet
+ * complete then just save req in the right slot, otherwise reply here. After
+ * the call to DFUSE_REPLY_* no reference is held on either the open file or
+ * the inode so at that point they could be closed.
+ */
+ rcb = true;
+
+ D_MUTEX_LOCK(&rc_lock);
+ if (cd->complete) {
+ ev = cd->ev;
+ } else {
+ cd->reqs[slot] = req;
+ cd->ohs[slot] = oh;
+ }
+ D_MUTEX_UNLOCK(&rc_lock);
+
+ if (ev) {
+ if (cd->rc != 0) {
+ /* Don't pass fuse an error here, rather return false and the read
+ * will be tried over the network.
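+ * (Returning false causes the caller, dfuse_cb_read(), to fall back
+ * to a regular network read of just this 128k range.)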
+ */
+ rcb = false;
+ } else {
+ DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position,
+ position + K128 - 1);
+ DFUSE_REPLY_BUFQ(oh, req, ev->de_iov.iov_buf + (slot * K128), K128);
+ }
+ if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7) {
+ d_slab_release(cd->eqt->de_read_slab, cd->ev);
+ D_FREE(cd);
+ }
+ }
+ }
+
+ return rcb;
+
+err:
+ D_MUTEX_UNLOCK(&rc_lock);
+ return false;
+}
+
 void
 dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct fuse_file_info *fi)
 {
@@ -120,7 +447,6 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct
 struct dfuse_eq *eqt;
 int rc;
 struct dfuse_event *ev;
- uint64_t eqt_idx;
 
 DFUSE_IE_STAT_ADD(oh->doh_ie, DS_READ);
 
@@ -157,8 +483,10 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct
 return;
 }
 
- eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1);
- eqt = &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count];
+ if (chunk_read(req, len, position, oh))
+ return;
+
+ eqt = pick_eqt(dfuse_info);
 
 ev = d_slab_acquire(eqt->de_read_slab);
 if (ev == NULL)
@@ -253,12 +581,9 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh)
 struct dfuse_eq *eqt;
 int rc;
 struct dfuse_event *ev;
- uint64_t eqt_idx;
 size_t len = oh->doh_ie->ie_stat.st_size;
 
- eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1);
- eqt = &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count];
-
+ eqt = pick_eqt(dfuse_info);
 ev = d_slab_acquire(eqt->de_pre_read_slab);
 if (ev == NULL)
 D_GOTO(err, rc = ENOMEM);

From a34691b13e71524fc0376d33f649b691cf65e33c Mon Sep 17 00:00:00 2001
From: Ashley Pittman
Date: Mon, 4 Nov 2024 17:59:00 +0000
Subject: [PATCH 02/26] DAOS-16729 dfuse: Remove deprecated single-threaded option. (#15345)

This only serves to add confusion at this point.

Signed-off-by: Ashley Pittman
---
 docs/user/filesystem.md | 1 -
 src/client/dfuse/dfuse.h | 1 -
 src/client/dfuse/dfuse_main.c | 26 ++++++++------------------
 src/tests/ftest/util/dfuse_utils.py | 1 -
 utils/node_local_test.py | 19 +++++--------------
 5 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/docs/user/filesystem.md b/docs/user/filesystem.md
index cf9ca813dc2..937864a1bf9 100644
--- a/docs/user/filesystem.md
+++ b/docs/user/filesystem.md
@@ -228,7 +228,6 @@ Additionally, there are several optional command-line options:
 | --container= | container label or uuid to open |
 | --sys-name= | DAOS system name |
 | --foreground | run in foreground |
-| --singlethreaded | run single threaded |
 | --thread-count= | Number of threads to use |
 | --multi-user | Run in multi user mode |
 | --read-only | Mount in read-only mode |
diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index e3b3c0d7d0e..9d162810db6 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -29,7 +29,6 @@ struct dfuse_info {
 char *di_mountpoint;
 int32_t di_thread_count;
 uint32_t di_eq_count;
- bool di_threaded;
 bool di_foreground;
 bool di_caching;
 bool di_multi_user;
diff --git a/src/client/dfuse/dfuse_main.c b/src/client/dfuse/dfuse_main.c
index d75656121a5..02db62cc4e9 100644
--- a/src/client/dfuse/dfuse_main.c
+++ b/src/client/dfuse/dfuse_main.c
@@ -166,6 +166,6 @@ dfuse_bg(struct dfuse_info *dfuse_info)
 *
 * Should be called from the post_start plugin callback and creates
 * a filesystem.
- * Returns true on success, false on failure.
+ * Returns a DAOS error code.
*/ int @@ -204,18 +205,17 @@ dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args) DFUSE_TRA_ERROR(dfuse_info, "Error sending signal to fg: "DF_RC, DP_RC(rc)); /* Blocking */ - if (dfuse_info->di_threaded) - rc = dfuse_loop(dfuse_info); - else - rc = fuse_session_loop(dfuse_info->di_session); - if (rc != 0) + rc = dfuse_loop(dfuse_info); + if (rc != 0) { DHS_ERROR(dfuse_info, rc, "Fuse loop exited"); + rc = daos_errno2der(rc); + } umount: fuse_session_unmount(dfuse_info->di_session); - return daos_errno2der(rc); + return rc; } #define DF_POOL_PREFIX "pool=" @@ -279,7 +279,6 @@ show_help(char *name) " --path= Path to load UNS pool/container data\n" " --sys-name=STR DAOS system name context for servers\n" "\n" - " -S --singlethread Single threaded (deprecated)\n" " -t --thread-count=count Total number of threads to use\n" " -e --eq-count=count Number of event queues to use\n" " -f --foreground Run in foreground\n" @@ -423,7 +422,6 @@ main(int argc, char **argv) {"pool", required_argument, 0, 'p'}, {"container", required_argument, 0, 'c'}, {"sys-name", required_argument, 0, 'G'}, - {"singlethread", no_argument, 0, 'S'}, {"thread-count", required_argument, 0, 't'}, {"eq-count", required_argument, 0, 'e'}, {"foreground", no_argument, 0, 'f'}, @@ -447,13 +445,12 @@ main(int argc, char **argv) if (dfuse_info == NULL) D_GOTO(out_debug, rc = -DER_NOMEM); - dfuse_info->di_threaded = true; dfuse_info->di_caching = true; dfuse_info->di_wb_cache = true; dfuse_info->di_eq_count = 1; while (1) { - c = getopt_long(argc, argv, "Mm:St:o:fhe:v", long_options, NULL); + c = getopt_long(argc, argv, "Mm:t:o:fhe:v", long_options, NULL); if (c == -1) break; @@ -491,13 +488,6 @@ main(int argc, char **argv) case 'P': path = optarg; break; - case 'S': - /* Set it to be single threaded, but allow an extra one - * for the event queue processing - */ - dfuse_info->di_threaded = false; - dfuse_info->di_thread_count = 2; - break; case 'e': dfuse_info->di_eq_count = atoi(optarg); break; @@ -564,7 +554,7 @@ main(int argc, char **argv) * check CPU binding. If bound to a number of cores then launch that number of threads, * if not bound them limit to 16. 
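+ * For example, a dfuse instance bound by its launcher to eight cores will start
+ * eight threads, while an unbound instance on a large node is capped at 16.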
 */
- if (dfuse_info->di_threaded && !have_thread_count) {
+ if (!have_thread_count) {
 struct hwloc_topology *hwt;
 hwloc_const_cpuset_t hw;
 int total;
diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py
index a26f372e76d..900da63ebf1 100644
--- a/src/tests/ftest/util/dfuse_utils.py
+++ b/src/tests/ftest/util/dfuse_utils.py
@@ -30,7 +30,6 @@ def __init__(self, namespace, command, path=""):
 self.sys_name = FormattedParameter("--sys-name {}")
 self.thread_count = FormattedParameter("--thread-count {}")
 self.eq_count = FormattedParameter("--eq-count {}")
- self.singlethreaded = FormattedParameter("--singlethread", False)
 self.foreground = FormattedParameter("--foreground", False)
 self.enable_caching = FormattedParameter("--enable-caching", False)
 self.enable_wb_cache = FormattedParameter("--enable-wb-cache", False)
diff --git a/utils/node_local_test.py b/utils/node_local_test.py
index b3f7d9a457c..623fead2c25 100755
--- a/utils/node_local_test.py
+++ b/utils/node_local_test.py
@@ -1337,7 +1337,7 @@ def __str__(self):
 
 return f'DFuse instance at {self.dir} ({running})'
 
- def start(self, v_hint=None, single_threaded=False, use_oopt=False):
+ def start(self, v_hint=None, use_oopt=False):
 """Start a dfuse instance"""
 # pylint: disable=too-many-branches
 dfuse_bin = join(self.conf['PREFIX'], 'bin', 'dfuse')
@@ -1385,9 +1385,7 @@ def start(self, v_hint=None, use_oopt=False):
 if self.multi_user:
 cmd.append('--multi-user')
 
- if single_threaded:
- cmd.append('--singlethread')
- elif not self.cores:
+ if not self.cores:
 # Use a lower default thread-count for NLT due to running tests in parallel.
 cmd.extend(['--thread-count', '4'])
 
@@ -1980,11 +1978,9 @@ class needs_dfuse_with_opt():
 wrapping_lock = threading.Lock()
 
 # pylint: disable=too-few-public-methods
- def __init__(self, caching_variants=None, wbcache=True, single_threaded=False,
- dfuse_inval=True, ro=False):
+ def __init__(self, caching_variants=None, wbcache=True, dfuse_inval=True, ro=False):
 self.caching_variants = caching_variants if caching_variants else [False, True]
 self.wbcache = wbcache
- self.single_threaded = single_threaded
 self.dfuse_inval = dfuse_inval
 self.ro = ro
 
@@ -2020,7 +2016,7 @@ def _helper(obj):
 caching=caching,
 wbcache=self.wbcache,
 **args)
- obj.dfuse.start(v_hint=method.__name__, single_threaded=self.single_threaded)
+ obj.dfuse.start(v_hint=method.__name__)
 try:
 rc = method(obj)
 finally:
@@ -2678,11 +2674,6 @@ def test_readdir_unlink(self):
 assert len(post_files) == len(files) - 1
 assert post_files == files[:-2] + [files[-1]]
 
- @needs_dfuse_with_opt(single_threaded=True, caching_variants=[True])
- def test_single_threaded(self):
- """Test single-threaded mode"""
- self.readdir_test(10)
-
 @needs_dfuse
 def test_open_replaced(self):
 """Test that fstat works on file clobbered by rename"""
@@ -5987,7 +5978,7 @@ def test_dfuse_start(server, conf, wf):
 
 cmd = [join(conf['PREFIX'], 'bin', 'dfuse'), '--mountpoint', mount_point,
- '--pool', pool.id(), '--cont', container.id(), '--foreground', '--singlethread']
+ '--pool', pool.id(), '--cont', container.id(), '--foreground', '--thread-count=2']
 
 test_cmd = AllocFailTest(conf, 'dfuse', cmd)
 test_cmd.wf = wf

From ded7560de9fa7844592d0cf85e77617828a1029f Mon Sep 17 00:00:00 2001
From: Ashley Pittman
Date: Fri, 8 Nov 2024 08:54:21 +0000
Subject: [PATCH 03/26] DAOS-16736 dfuse: Add a common struct for active IE data. (#15362)

Create an active_inode struct and allocate it for all inodes which have
one or more open handles.
This allows us to share state/caching data across open handles more
easily and to better support concurrent readers.

Future work here will improve performance for concurrent readers when
caching is used, and allow us to make the in-memory inode struct smaller
which will save memory.

Signed-off-by: Ashley Pittman <ashley.m.pittman@intel.com>
---
 src/client/dfuse/SConscript | 1 +
 src/client/dfuse/dfuse.h | 19 ++++++-
 src/client/dfuse/dfuse_core.c | 1 +
 src/client/dfuse/file.c | 94 ++++++++++++++++++++++++++++++++++
 src/client/dfuse/ops/create.c | 12 +++--
 src/client/dfuse/ops/lookup.c | 12 +++++
 src/client/dfuse/ops/open.c | 25 +++++----
 src/client/dfuse/ops/opendir.c | 14 +++--
 src/client/dfuse/ops/read.c | 53 ++++++------------
 utils/cq/words.dict | 1 +
 utils/node_local_test.py | 47 ++++++++++++++++-
 11 files changed, 223 insertions(+), 56 deletions(-)
 create mode 100644 src/client/dfuse/file.c

diff --git a/src/client/dfuse/SConscript b/src/client/dfuse/SConscript
index fdd60eac11d..12ef33841d4 100644
--- a/src/client/dfuse/SConscript
+++ b/src/client/dfuse/SConscript
@@ -7,6 +7,7 @@ DFUSE_SRC = ['dfuse_core.c',
 'dfuse_main.c',
 'dfuse_fuseops.c',
 'inval.c',
+ 'file.c',
 'dfuse_cont.c',
 'dfuse_thread.c',
 'dfuse_pool.c']
diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index 9d162810db6..9b3ece82444 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -1009,12 +1009,29 @@ struct dfuse_inode_entry {
 */
 ATOMIC bool ie_linear_read;
 
+ struct active_inode *ie_active;
+
 /* Entry on the evict list */
 d_list_t ie_evict_entry;
+};
 
- struct read_chunk_core *ie_chunk;
 };
+struct active_inode {
+ d_list_t chunks;
+ pthread_spinlock_t lock;
+};
+
+/* Increase active count on inode. This takes a reference and allocates ie->active as required */
+int
+active_ie_init(struct dfuse_inode_entry *ie);
+
+/* Mark a oh as closing and drop the ref on inode active */
+bool
+active_oh_decref(struct dfuse_obj_hdl *oh);
+
+/* Decrease active count on inode, called on error where there is no oh */
+void
+active_ie_decref(struct dfuse_inode_entry *ie);
+
 /* Flush write-back cache writes to a inode. It does this by waiting for and then releasing an
 * exclusive lock on the inode. Writes take a shared lock so this will block until all pending
 * writes are complete.
diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c
index 0895451f9f8..e69b598efb7 100644
--- a/src/client/dfuse/dfuse_core.c
+++ b/src/client/dfuse/dfuse_core.c
@@ -1274,6 +1274,7 @@ dfuse_ie_close(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie)
 atomic_load_relaxed(&ie->ie_il_count));
 D_ASSERTF(atomic_load_relaxed(&ie->ie_open_count) == 0, "open_count is %d",
 atomic_load_relaxed(&ie->ie_open_count));
+ D_ASSERT(!ie->ie_active);
 
 if (ie->ie_obj) {
 rc = dfs_release(ie->ie_obj);
diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c
new file mode 100644
index 00000000000..c06b7f7fd81
--- /dev/null
+++ b/src/client/dfuse/file.c
@@ -0,0 +1,94 @@
+/**
+ * (C) Copyright 2024 Intel Corporation.
+ *
+ * SPDX-License-Identifier: BSD-2-Clause-Patent
+ */
+
+#include "dfuse_common.h"
+#include "dfuse.h"
+
+/* A lock is needed here, not for ie_open_count which is updated atomically here and elsewhere
+ * but to ensure that ie_active is also atomically updated with the reference count.
+ */
+static pthread_mutex_t alock = PTHREAD_MUTEX_INITIALIZER;
+
+/* Perhaps combine with dfuse_open_handle_init?
*/ +int +active_ie_init(struct dfuse_inode_entry *ie) +{ + uint32_t oc; + int rc = -DER_SUCCESS; + + D_MUTEX_LOCK(&alock); + + oc = atomic_fetch_add_relaxed(&ie->ie_open_count, 1); + + DFUSE_TRA_DEBUG(ie, "Addref to %d", oc + 1); + + if (oc != 0) + goto out; + + D_ALLOC_PTR(ie->ie_active); + if (!ie->ie_active) + D_GOTO(out, rc = -DER_NOMEM); + + rc = D_SPIN_INIT(&ie->ie_active->lock, 0); + if (rc != -DER_SUCCESS) { + D_FREE(ie->ie_active); + goto out; + } + D_INIT_LIST_HEAD(&ie->ie_active->chunks); +out: + D_MUTEX_UNLOCK(&alock); + return rc; +} + +static void +ah_free(struct dfuse_inode_entry *ie) +{ + D_SPIN_DESTROY(&ie->ie_active->lock); + D_FREE(ie->ie_active); +} + +bool +active_oh_decref(struct dfuse_obj_hdl *oh) +{ + uint32_t oc; + bool rcb = true; + + D_MUTEX_LOCK(&alock); + + oc = atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1); + D_ASSERTF(oc >= 1, "Invalid decref from %d on %p %p", oc, oh, oh->doh_ie); + + DFUSE_TRA_DEBUG(oh->doh_ie, "Decref to %d", oc - 1); + + if (oc != 1) + goto out; + + rcb = read_chunk_close(oh->doh_ie); + + ah_free(oh->doh_ie); +out: + D_MUTEX_UNLOCK(&alock); + return rcb; +} + +void +active_ie_decref(struct dfuse_inode_entry *ie) +{ + uint32_t oc; + D_MUTEX_LOCK(&alock); + + oc = atomic_fetch_sub_relaxed(&ie->ie_open_count, 1); + D_ASSERTF(oc >= 1, "Invalid decref from %d on %p", oc, ie); + + DFUSE_TRA_DEBUG(ie, "Decref to %d", oc - 1); + + if (oc != 1) + goto out; + + ah_free(ie); +out: + D_MUTEX_UNLOCK(&alock); +} diff --git a/src/client/dfuse/ops/create.c b/src/client/dfuse/ops/create.c index a389c253795..5549d17113b 100644 --- a/src/client/dfuse/ops/create.c +++ b/src/client/dfuse/ops/create.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -190,7 +190,7 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, const char *na /** duplicate the file handle for the fuse handle */ rc = dfs_dup(dfs->dfs_ns, oh->doh_obj, O_RDWR, &ie->ie_obj); if (rc) - D_GOTO(release, rc); + D_GOTO(drop_ie, rc); oh->doh_writeable = true; @@ -217,14 +217,18 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, const char *na dfuse_compute_inode(dfs, &ie->ie_oid, &ie->ie_stat.st_ino); - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); + rc = active_ie_init(ie); + if (rc != -DER_SUCCESS) + goto drop_oh; /* Return the new inode data, and keep the parent ref */ dfuse_reply_entry(dfuse_info, ie, &fi_out, true, req); return; -release: +drop_oh: dfs_release(oh->doh_obj); +drop_ie: + dfs_release(ie->ie_obj); err: DFUSE_REPLY_ERR_RAW(parent, req, rc); dfuse_oh_free(dfuse_info, oh); diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c index 8e168102cb4..90af5d9b8e1 100644 --- a/src/client/dfuse/ops/lookup.c +++ b/src/client/dfuse/ops/lookup.c @@ -88,6 +88,18 @@ dfuse_reply_entry(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, D_GOTO(out_err, rc = EIO); } + /* Make the inode active for the create case */ + if (ie->ie_active) { + D_ASSERT(atomic_load_relaxed(&ie->ie_open_count) == 1); + active_ie_decref(ie); + rc = active_ie_init(inode); + if (rc != -DER_SUCCESS) { + atomic_fetch_sub_relaxed(&ie->ie_ref, 1); + dfuse_ie_close(dfuse_info, ie); + D_GOTO(out_err, rc); + } + } + DFUSE_TRA_DEBUG(inode, "Maybe updating parent inode %#lx dfs_ino %#lx", entry.ino, ie->ie_dfs->dfs_ino); diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 772e92e1c13..447c5213b72 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -93,6 +93,10 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) fi_out.fh = (uint64_t)oh; + rc = active_ie_init(ie); + if (rc) + goto err; + /* * dfs_dup() just locally duplicates the file handle. If we have * O_TRUNC flag, we need to truncate the file manually. @@ -100,12 +104,10 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (fi->flags & O_TRUNC) { rc = dfs_punch(ie->ie_dfs->dfs_ns, ie->ie_obj, 0, DFS_MAX_FSIZE); if (rc) - D_GOTO(err, rc); + D_GOTO(decref, rc); dfuse_dcache_evict(oh->doh_ie); } - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); - /* Enable this for files up to the max read size. */ if (prefetch && oh->doh_parent_dir && atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && ie->ie_stat.st_size > 0 && @@ -127,11 +129,17 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) dfuse_pre_read(dfuse_info, oh); return; +decref: + active_ie_decref(ie); err: dfuse_oh_free(dfuse_info, oh); DFUSE_REPLY_ERR_RAW(ie, req, rc); } +/* Release a file handle, called after close() by an application. + * + * Can be invoked concurrently on the same inode. 
+ */ void dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) { @@ -139,13 +147,14 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; struct dfuse_inode_entry *ie = NULL; int rc; - uint32_t oc; uint32_t il_calls; /* Perform the opposite of what the ioctl call does, always change the open handle count * but the inode only tracks number of open handles with non-zero ioctl counts */ + D_ASSERT(oh->doh_ie->ie_active); + DFUSE_TRA_DEBUG(oh, "Closing %d", oh->doh_caching); DFUSE_IE_WFLUSH(oh->doh_ie); @@ -207,17 +216,15 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (il_calls != 0) { atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1); } - oc = atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1); - if (oc == 1) { - if (read_chunk_close(oh->doh_ie)) - oh->doh_linear_read = true; - } if (oh->doh_evict_on_close) { ie = oh->doh_ie; atomic_fetch_add_relaxed(&ie->ie_ref, 1); } + if (active_oh_decref(oh)) + oh->doh_linear_read = true; + rc = dfs_release(oh->doh_obj); if (rc == 0) { DFUSE_REPLY_ZERO_OH(oh, req); diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c index 091d4102c1c..eca61a45526 100644 --- a/src/client/dfuse/ops/opendir.c +++ b/src/client/dfuse/ops/opendir.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -19,6 +19,10 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ if (!oh) D_GOTO(err, rc = ENOMEM); + rc = active_ie_init(ie); + if (rc != -DER_SUCCESS) + D_GOTO(free, rc = daos_der2errno(rc)); + DFUSE_TRA_UP(oh, ie, "open handle"); dfuse_open_handle_init(dfuse_info, oh, ie); @@ -35,12 +39,11 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ fi_out.keep_cache = 1; } - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); - DFUSE_REPLY_OPEN_DIR(oh, req, &fi_out); return; -err: +free: D_FREE(oh); +err: DFUSE_REPLY_ERR_RAW(ie, req, rc); } @@ -57,7 +60,8 @@ dfuse_cb_releasedir(fuse_req_t req, struct dfuse_inode_entry *ino, struct fuse_f if (atomic_load_relaxed(&oh->doh_il_calls) != 0) atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1); - atomic_fetch_sub_relaxed(&oh->doh_ie->ie_open_count, 1); + + active_oh_decref(oh); DFUSE_TRA_DEBUG(oh, "Kernel cache flags invalid %d started %d finished %d", oh->doh_kreaddir_invalid, oh->doh_kreaddir_started, diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 43ce9328230..7bef23463eb 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -144,6 +144,7 @@ pick_eqt(struct dfuse_info *dfuse_info) struct read_chunk_data { struct dfuse_event *ev; + struct active_inode *ia; fuse_req_t reqs[8]; struct dfuse_obj_hdl *ohs[8]; d_list_t list; @@ -156,19 +157,6 @@ struct read_chunk_data { bool complete; }; -struct read_chunk_core { - d_list_t entries; -}; - -/* Global lock for all chunk read operations. Each inode has a struct read_chunk_core * entry - * which is checked for NULL and set whilst holding this lock. Each read_chunk_core then has - * a list of read_chunk_data and again, this lock protects all lists on all inodes. This avoids - * the need for a per-inode lock which for many files would consume considerable memory but does - * mean there is potentially some lock contention. 
The lock however is only held for list - * manipulation, no dfs or kernel calls are made whilst holding the lock. - */ -static pthread_mutex_t rc_lock = PTHREAD_MUTEX_INITIALIZER; - static void chunk_free(struct read_chunk_data *cd) { @@ -188,22 +176,21 @@ read_chunk_close(struct dfuse_inode_entry *ie) struct read_chunk_data *cd, *cdn; bool rcb = false; - D_MUTEX_LOCK(&rc_lock); - if (!ie->ie_chunk) + D_SPIN_LOCK(&ie->ie_active->lock); + if (d_list_empty(&ie->ie_active->chunks)) goto out; rcb = true; - d_list_for_each_entry_safe(cd, cdn, &ie->ie_chunk->entries, list) { + d_list_for_each_entry_safe(cd, cdn, &ie->ie_active->chunks, list) { if (cd->complete) { chunk_free(cd); } else { cd->exiting = true; } } - D_FREE(ie->ie_chunk); out: - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ie->ie_active->lock); return rcb; } @@ -211,6 +198,7 @@ static void chunk_cb(struct dfuse_event *ev) { struct read_chunk_data *cd = ev->de_cd; + struct active_inode *ia = cd->ia; fuse_req_t req; bool done = false; @@ -228,11 +216,11 @@ chunk_cb(struct dfuse_event *ev) int i; req = 0; - D_MUTEX_LOCK(&rc_lock); + D_SPIN_LOCK(&ia->lock); if (cd->exiting) { chunk_free(cd); - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ia->lock); return; } @@ -245,7 +233,7 @@ chunk_cb(struct dfuse_event *ev) } } - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ia->lock); if (req) { size_t position = (cd->bucket * CHUNK_SIZE) + (i * K128); @@ -332,7 +320,6 @@ static bool chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) { struct dfuse_inode_entry *ie = oh->doh_ie; - struct read_chunk_core *cc; struct read_chunk_data *cd; off_t last; uint64_t bucket; @@ -359,16 +346,9 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) DFUSE_TRA_DEBUG(oh, "read bucket %#zx-%#zx last %#zx size %#zx bucket %ld slot %d", position, position + len - 1, last, ie->ie_stat.st_size, bucket, slot); - D_MUTEX_LOCK(&rc_lock); - if (ie->ie_chunk == NULL) { - D_ALLOC_PTR(ie->ie_chunk); - if (ie->ie_chunk == NULL) - goto err; - D_INIT_LIST_HEAD(&ie->ie_chunk->entries); - } - cc = ie->ie_chunk; + D_SPIN_LOCK(&ie->ie_active->lock); - d_list_for_each_entry(cd, &cc->entries, list) + d_list_for_each_entry(cd, &ie->ie_active->chunks, list) if (cd->bucket == bucket) { /* Remove from list to re-add again later. 
*/ d_list_del(&cd->list); @@ -379,6 +359,7 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) if (cd == NULL) goto err; + cd->ia = ie->ie_active; cd->bucket = bucket; submit = true; @@ -386,10 +367,10 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) if (++cd->entered < 8) { /* Put on front of list for efficient searching */ - d_list_add(&cd->list, &cc->entries); + d_list_add(&cd->list, &ie->ie_active->chunks); } - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ie->ie_active->lock); if (submit) { DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot); @@ -404,14 +385,14 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) */ rcb = true; - D_MUTEX_LOCK(&rc_lock); + D_SPIN_LOCK(&ie->ie_active->lock); if (cd->complete) { ev = cd->ev; } else { cd->reqs[slot] = req; cd->ohs[slot] = oh; } - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ie->ie_active->lock); if (ev) { if (cd->rc != 0) { @@ -434,7 +415,7 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) return rcb; err: - D_MUTEX_UNLOCK(&rc_lock); + D_SPIN_UNLOCK(&ie->ie_active->lock); return false; } diff --git a/utils/cq/words.dict b/utils/cq/words.dict index 7a2784b61f8..102ca21a882 100644 --- a/utils/cq/words.dict +++ b/utils/cq/words.dict @@ -140,6 +140,7 @@ debian debuginfo defusedxml del +dentry deps dereference dereferencing diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 623fead2c25..f29cd3f7ed7 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2007,6 +2007,12 @@ def _helper(obj): 'dfuse-dentry-dir-time': '5m', 'dfuse-ndentry-time': '5m'} obj.container.set_attrs(cont_attrs) + elif caching: + cont_attrs = {'dfuse-attr-time': '1m', + 'dfuse-dentry-time': '1m', + 'dfuse-dentry-dir-time': '1m', + 'dfuse-ndentry-time': '1m'} + obj.container.set_attrs(cont_attrs) if self.ro: args["ro"] = True @@ -3369,7 +3375,7 @@ def test_complex_unlink(self): print(os.listdir(self.dfuse.dir)) print(os.listdir(dfuse.dir)) - # Rename file 0 to file 0 in the background, this will remove file 1 + # Rename file 0 to file 1 in the background, this will remove the old file 1 os.rename(join(dfuse.dir, 'file.0'), join(dfuse.dir, 'file.1')) # Perform the unlink, this will unlink the other file. @@ -3386,6 +3392,45 @@ def test_complex_unlink(self): for fd in fds: fd.close() + @needs_dfuse_with_opt(caching_variants=[False]) + def test_create_exists(self): + """Test creating a file. + + This tests for create where the dentry being created already exists and is a file that's + known to dfuse. + + To do this make a file in dfuse, use a back channel to rename it and then create a file + using the new name.""" + + filename = join(self.dfuse.dir, 'myfile') + + with open(filename, 'w') as fd: + fd.write('hello') + + filename = join(self.dfuse.dir, 'newfile') + try: + os.stat(filename) + raise NLTestFail("File exists") + except FileNotFoundError: + pass + + # Start another dfuse instance to move the files around without the kernel knowing. 
 dfuse = DFuse(self.server,
 self.conf,
 container=self.container,
 caching=False)
 dfuse.start(v_hint='create_exists_1')
 
 os.rename(join(dfuse.dir, 'myfile'), join(dfuse.dir, 'newfile'))
 
 filename = join(self.dfuse.dir, 'newfile')
 
 with open(filename, 'w') as fd:
 fd.write('hello')
 
 if dfuse.stop():
 self.fatal_errors = True
 
 def test_cont_rw(self):
 """Test write access to another users container"""
 dfuse = DFuse(self.server,

From 6abf25a3432876d995951e276a27689e10f5e0a3 Mon Sep 17 00:00:00 2001
From: Ashley Pittman
Date: Fri, 15 Nov 2024 16:22:52 +0000
Subject: [PATCH 04/26] DAOS-16686 dfuse: Move pre-read code to inode from file handle. (#15488)

Attach pre-read buffers to the inode rather than open file handle.

This code was written with the idea that a client would open and read
the file and that would populate the kernel cache. In practice, however,
there are often concurrent readers for the same file, and what was
happening was that the first-to-open was doing the pre-read but was
often not the first process to perform a read; furthermore there would
often be multiple reads for the same regions, all of which would hit
the network.

Use the "active" entry on the inode to launch a pre-read on first open
and have any request on any file handle use the pre-read buffer for
replies if possible.

In addition, when a read is to be serviced from the pre-read buffer but
the data is not yet in memory then, rather than spinning on a lock and
consuming a fuse thread, add a descriptor to a callback list so that
the thread is released and the reply is made sooner, when the data is
available.

This greatly reduces the number of duplicate network round-trip reads
for workloads where multiple clients are trying to fetch the same data,
something that we see a lot in some applications.

Signed-off-by: Ashley Pittman <ashley.m.pittman@intel.com>
---
 src/client/dfuse/dfuse.h | 17 ++--
 src/client/dfuse/file.c | 48 +++++++--
 src/client/dfuse/ops/create.c | 2 +-
 src/client/dfuse/ops/lookup.c | 4 +-
 src/client/dfuse/ops/open.c | 46 ++------
 src/client/dfuse/ops/opendir.c | 4 +-
 src/client/dfuse/ops/read.c | 178 ++++++++++++++++++++------------
 7 files changed, 176 insertions(+), 123 deletions(-)

diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index 9b3ece82444..8e974c4c4ee 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -136,9 +136,10 @@ struct dfuse_inode_entry;
 * when EOF is returned to the kernel. If it's still present on release then it's freed then.
 */
 struct dfuse_pre_read {
- pthread_mutex_t dra_lock;
+ d_list_t req_list;
 struct dfuse_event *dra_ev;
 int dra_rc;
+ bool complete;
 };
 
 /** what is returned as the handle for fuse fuse_file_info on create/open/opendir */
@@ -148,8 +149,6 @@ struct dfuse_obj_hdl {
 /** the DFS object handle. Not created for directories. */
 dfs_obj_t *doh_obj;
 
- struct dfuse_pre_read *doh_readahead;
-
 /** the inode entry for the file */
 struct dfuse_inode_entry *doh_ie;
 
@@ -406,6 +405,7 @@ struct dfuse_event {
 struct dfuse_inode_entry *de_ie;
 struct read_chunk_data *de_cd;
 };
+ struct dfuse_info *de_di;
 off_t de_req_position; /**< The file position requested by fuse */
 union {
 size_t de_req_len;
@@ -1016,21 +1016,22 @@ struct dfuse_inode_entry {
 };
 
 struct active_inode {
- d_list_t chunks;
- pthread_spinlock_t lock;
+ d_list_t chunks;
+ pthread_spinlock_t lock;
+ struct dfuse_pre_read *readahead;
 };
 
 /* Increase active count on inode.
This takes a reference and allocates ie->active as required */ int -active_ie_init(struct dfuse_inode_entry *ie); +active_ie_init(struct dfuse_inode_entry *ie, bool *preread); /* Mark a oh as closing and drop the ref on inode active */ bool -active_oh_decref(struct dfuse_obj_hdl *oh); +active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh); /* Decrease active count on inode, called on error where there is no oh */ void -active_ie_decref(struct dfuse_inode_entry *ie); +active_ie_decref(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie); /* Flush write-back cache writes to a inode. It does this by waiting for and then releasing an * exclusive lock on the inode. Writes take a shared lock so this will block until all pending diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index c06b7f7fd81..f18fbde0c56 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -14,7 +14,7 @@ static pthread_mutex_t alock = PTHREAD_MUTEX_INITIALIZER; /* Perhaps combine with dfuse_open_handle_init? */ int -active_ie_init(struct dfuse_inode_entry *ie) +active_ie_init(struct dfuse_inode_entry *ie, bool *preread) { uint32_t oc; int rc = -DER_SUCCESS; @@ -25,8 +25,11 @@ active_ie_init(struct dfuse_inode_entry *ie) DFUSE_TRA_DEBUG(ie, "Addref to %d", oc + 1); - if (oc != 0) + if (oc != 0) { + if (preread && *preread) + *preread = false; goto out; + } D_ALLOC_PTR(ie->ie_active); if (!ie->ie_active) @@ -38,20 +41,47 @@ active_ie_init(struct dfuse_inode_entry *ie) goto out; } D_INIT_LIST_HEAD(&ie->ie_active->chunks); + if (preread && *preread) { + D_ALLOC_PTR(ie->ie_active->readahead); + if (ie->ie_active->readahead) { + D_INIT_LIST_HEAD(&ie->ie_active->readahead->req_list); + atomic_fetch_add_relaxed(&ie->ie_open_count, 1); + } + } + /* Take a reference on the inode to prevent it being released */ + atomic_fetch_add_relaxed(&ie->ie_ref, 1); out: D_MUTEX_UNLOCK(&alock); return rc; } static void -ah_free(struct dfuse_inode_entry *ie) +ah_free(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { - D_SPIN_DESTROY(&ie->ie_active->lock); + struct active_inode *active = ie->ie_active; + + if (active->readahead) { + struct dfuse_event *ev; + + D_ASSERT(active->readahead->complete); + D_ASSERT(d_list_empty(&active->readahead->req_list)); + + ev = active->readahead->dra_ev; + + if (ev) { + daos_event_fini(&ev->de_ev); + d_slab_release(ev->de_eqt->de_pre_read_slab, ev); + } + D_FREE(active->readahead); + } + + D_SPIN_DESTROY(&active->lock); D_FREE(ie->ie_active); + dfuse_inode_decref(dfuse_info, ie); } bool -active_oh_decref(struct dfuse_obj_hdl *oh) +active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) { uint32_t oc; bool rcb = true; @@ -68,14 +98,14 @@ active_oh_decref(struct dfuse_obj_hdl *oh) rcb = read_chunk_close(oh->doh_ie); - ah_free(oh->doh_ie); + ah_free(dfuse_info, oh->doh_ie); out: D_MUTEX_UNLOCK(&alock); return rcb; } void -active_ie_decref(struct dfuse_inode_entry *ie) +active_ie_decref(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { uint32_t oc; D_MUTEX_LOCK(&alock); @@ -88,7 +118,9 @@ active_ie_decref(struct dfuse_inode_entry *ie) if (oc != 1) goto out; - ah_free(ie); + read_chunk_close(ie); + + ah_free(dfuse_info, ie); out: D_MUTEX_UNLOCK(&alock); } diff --git a/src/client/dfuse/ops/create.c b/src/client/dfuse/ops/create.c index 5549d17113b..d42315d8dd5 100644 --- a/src/client/dfuse/ops/create.c +++ b/src/client/dfuse/ops/create.c @@ -217,7 +217,7 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, 
const char *na dfuse_compute_inode(dfs, &ie->ie_oid, &ie->ie_stat.st_ino); - rc = active_ie_init(ie); + rc = active_ie_init(ie, NULL); if (rc != -DER_SUCCESS) goto drop_oh; diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c index 90af5d9b8e1..77bcc9bb0da 100644 --- a/src/client/dfuse/ops/lookup.c +++ b/src/client/dfuse/ops/lookup.c @@ -91,8 +91,8 @@ dfuse_reply_entry(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, /* Make the inode active for the create case */ if (ie->ie_active) { D_ASSERT(atomic_load_relaxed(&ie->ie_open_count) == 1); - active_ie_decref(ie); - rc = active_ie_init(inode); + active_ie_decref(dfuse_info, ie); + rc = active_ie_init(inode, NULL); if (rc != -DER_SUCCESS) { atomic_fetch_sub_relaxed(&ie->ie_ref, 1); dfuse_ie_close(dfuse_info, ie); diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 447c5213b72..2cb6a12a2bf 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -67,6 +67,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) * which pre-existed in the container. */ + /* TODO: This probably wants reflowing to not reference ie_open_count */ if (atomic_load_relaxed(&ie->ie_open_count) > 0 || ((ie->ie_dcache_last_update.tv_sec != 0) && dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_data_timeout))) { @@ -93,7 +94,14 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) fi_out.fh = (uint64_t)oh; - rc = active_ie_init(ie); + /* Pre-read this for files up to the max read size. */ + if (prefetch && oh->doh_parent_dir && + atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && ie->ie_stat.st_size > 0 && + ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ) { + preread = true; + } + + rc = active_ie_init(ie, &preread); if (rc) goto err; @@ -108,18 +116,6 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) dfuse_dcache_evict(oh->doh_ie); } - /* Enable this for files up to the max read size. */ - if (prefetch && oh->doh_parent_dir && - atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && ie->ie_stat.st_size > 0 && - ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ) { - D_ALLOC_PTR(oh->doh_readahead); - if (oh->doh_readahead) { - D_MUTEX_INIT(&oh->doh_readahead->dra_lock, 0); - D_MUTEX_LOCK(&oh->doh_readahead->dra_lock); - preread = true; - } - } - DFUSE_REPLY_OPEN(oh, req, &fi_out); /* No reference is held on oh here but if preread is true then a lock is held which prevents @@ -130,7 +126,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) return; decref: - active_ie_decref(ie); + active_ie_decref(dfuse_info, ie); err: dfuse_oh_free(dfuse_info, oh); DFUSE_REPLY_ERR_RAW(ie, req, rc); @@ -159,26 +155,6 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) DFUSE_IE_WFLUSH(oh->doh_ie); - if (oh->doh_readahead) { - struct dfuse_event *ev; - - /* Grab this lock first to ensure that the read cb has been completed. The - * callback might register an error and release ev so do not read it's value - * until after this has completed. - */ - D_MUTEX_LOCK(&oh->doh_readahead->dra_lock); - D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock); - - ev = oh->doh_readahead->dra_ev; - - D_MUTEX_DESTROY(&oh->doh_readahead->dra_lock); - if (ev) { - daos_event_fini(&ev->de_ev); - d_slab_release(ev->de_eqt->de_pre_read_slab, ev); - } - D_FREE(oh->doh_readahead); - } - /* If the file was read from then set the data cache time for future use, however if the * file was written to then evict the metadata cache. 
 * The problem here is that if the file was written to then the contents will be in the
@@ -222,7 +198,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
 atomic_fetch_add_relaxed(&ie->ie_ref, 1);
 }
 
- if (active_oh_decref(oh))
+ if (active_oh_decref(dfuse_info, oh))
 oh->doh_linear_read = true;
 
 rc = dfs_release(oh->doh_obj);
diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c
index eca61a45526..60d1e6ab4d5 100644
--- a/src/client/dfuse/ops/opendir.c
+++ b/src/client/dfuse/ops/opendir.c
@@ -19,7 +19,7 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_
 if (!oh)
 D_GOTO(err, rc = ENOMEM);
 
- rc = active_ie_init(ie);
+ rc = active_ie_init(ie, NULL);
 if (rc != -DER_SUCCESS)
 D_GOTO(free, rc = daos_der2errno(rc));
 
@@ -61,7 +61,7 @@ dfuse_cb_releasedir(fuse_req_t req, struct dfuse_inode_entry *ino, struct fuse_f
 if (atomic_load_relaxed(&oh->doh_il_calls) != 0)
 atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1);
 
- active_oh_decref(oh);
+ active_oh_decref(dfuse_info, oh);
diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c
index 7bef23463eb..7dd0898101c 100644
--- a/src/client/dfuse/ops/read.c
+++ b/src/client/dfuse/ops/read.c
@@ -54,17 +54,76 @@ dfuse_cb_read_complete(struct dfuse_event *ev)
 
 #define K128 (1024 * 128)
 
+struct read_req {
+ d_list_t list;
+ fuse_req_t req;
+ size_t len;
+ off_t position;
+ struct dfuse_obj_hdl *oh;
+};
+
+static void
+readahead_actual_reply(struct active_inode *active, struct read_req *rr)
+{
+ size_t reply_len;
+
+ if (rr->position + rr->len >= active->readahead->dra_ev->de_readahead_len) {
+ rr->oh->doh_linear_read_eof = true;
+ }
+
+ /* At this point there is a buffer of known length that contains the data, and a read
+ * request.
+ * If the attempted read is bigger than the data then it will be truncated.
+ * If the attempted read is smaller than the buffer it will be met in full.
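+ *
+ * As a worked example: with a 288k (0x48000 byte) readahead buffer, a 128k read
+ * at position 0x40000 reaches past the end of the data so only reply_len =
+ * 0x48000 - 0x40000 = 0x8000 bytes are returned.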
+ */ + + if (rr->position + rr->len < active->readahead->dra_ev->de_readahead_len) { + reply_len = rr->len; + DFUSE_TRA_DEBUG(rr->oh, "%#zx-%#zx read", rr->position, + rr->position + reply_len - 1); + } else { + /* The read will be truncated */ + reply_len = active->readahead->dra_ev->de_readahead_len - rr->position; + DFUSE_TRA_DEBUG(rr->oh, "%#zx-%#zx read %#zx-%#zx not read (truncated)", + rr->position, rr->position + reply_len - 1, + rr->position + reply_len, rr->position + rr->len - 1); + } + + DFUSE_IE_STAT_ADD(rr->oh->doh_ie, DS_PRE_READ); + DFUSE_REPLY_BUFQ(rr->oh, rr->req, active->readahead->dra_ev->de_iov.iov_buf + rr->position, + reply_len); +} + static bool dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) { - size_t reply_len; + struct active_inode *active = oh->doh_ie->ie_active; - if (oh->doh_readahead->dra_rc) { - DFUSE_REPLY_ERR_RAW(oh, req, oh->doh_readahead->dra_rc); + D_SPIN_LOCK(&active->lock); + if (!active->readahead->complete) { + struct read_req *rr; + + D_ALLOC_PTR(rr); + if (!rr) { + D_SPIN_UNLOCK(&active->lock); + return false; + } + rr->req = req; + rr->len = len; + rr->position = position; + rr->oh = oh; + d_list_add_tail(&rr->list, &active->readahead->req_list); + D_SPIN_UNLOCK(&active->lock); return true; } + D_SPIN_UNLOCK(&active->lock); - if (!oh->doh_linear_read || oh->doh_readahead->dra_ev == NULL) { + if (active->readahead->dra_rc) { + DFUSE_REPLY_ERR_RAW(oh, req, active->readahead->dra_rc); + return true; + } + + if (!oh->doh_linear_read || active->readahead->dra_ev == NULL) { DFUSE_TRA_DEBUG(oh, "Pre read disabled"); return false; } @@ -77,37 +136,19 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o oh->doh_linear_read_pos = max(oh->doh_linear_read_pos, position + len); } else if (oh->doh_linear_read_pos != position) { DFUSE_TRA_DEBUG(oh, "disabling pre read"); - daos_event_fini(&oh->doh_readahead->dra_ev->de_ev); - d_slab_release(oh->doh_readahead->dra_ev->de_eqt->de_pre_read_slab, - oh->doh_readahead->dra_ev); - oh->doh_readahead->dra_ev = NULL; return false; } else { oh->doh_linear_read_pos = position + len; } - if (position + len >= oh->doh_readahead->dra_ev->de_readahead_len) { - oh->doh_linear_read_eof = true; - } + struct read_req rr; - /* At this point there is a buffer of known length that contains the data, and a read - * request. - * If the attempted read is bigger than the data then it will be truncated. - * It the attempted read is smaller than the buffer it will be met in full. 
- */ - - if (position + len < oh->doh_readahead->dra_ev->de_readahead_len) { - reply_len = len; - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, position + reply_len - 1); - } else { - /* The read will be truncated */ - reply_len = oh->doh_readahead->dra_ev->de_readahead_len - position; - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read %#zx-%#zx not read (truncated)", position, - position + reply_len - 1, position + reply_len, position + len - 1); - } + rr.req = req; + rr.len = len; + rr.position = position; + rr.oh = oh; - DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); - DFUSE_REPLY_BUFQ(oh, req, oh->doh_readahead->dra_ev->de_iov.iov_buf + position, reply_len); + readahead_actual_reply(active, &rr); return true; } @@ -423,6 +464,7 @@ void dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct fuse_file_info *fi) { struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; + struct active_inode *active = oh->doh_ie->ie_active; struct dfuse_info *dfuse_info = fuse_req_userdata(req); bool mock_read = false; struct dfuse_eq *eqt; @@ -436,33 +478,14 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct oh->doh_linear_read_eof = false; oh->doh_linear_read = false; - if (oh->doh_readahead) { - D_MUTEX_LOCK(&oh->doh_readahead->dra_lock); - ev = oh->doh_readahead->dra_ev; - - oh->doh_readahead->dra_ev = NULL; - D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock); - - if (ev) { - daos_event_fini(&ev->de_ev); - d_slab_release(ev->de_eqt->de_pre_read_slab, ev); - DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); - } - } + if (active->readahead) + DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); DFUSE_REPLY_BUFQ(oh, req, NULL, 0); return; } - if (oh->doh_readahead) { - bool replied; - - D_MUTEX_LOCK(&oh->doh_readahead->dra_lock); - replied = dfuse_readahead_reply(req, len, position, oh); - D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock); - - if (replied) - return; - } + if (active->readahead && dfuse_readahead_reply(req, len, position, oh)) + return; if (chunk_read(req, len, position, oh)) return; @@ -529,18 +552,37 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct } } +static void +pre_read_mark_done(struct active_inode *active) +{ + struct read_req *rr, *rrn; + + D_SPIN_LOCK(&active->lock); + active->readahead->complete = true; + D_SPIN_UNLOCK(&active->lock); + + /* No lock is held here as after complete is set then nothing further is added */ + d_list_for_each_entry_safe(rr, rrn, &active->readahead->req_list, list) { + d_list_del(&rr->list); + readahead_actual_reply(active, rr); + D_FREE(rr); + } +} + static void dfuse_cb_pre_read_complete(struct dfuse_event *ev) { - struct dfuse_obj_hdl *oh = ev->de_oh; + struct dfuse_info *dfuse_info = ev->de_di; + struct dfuse_inode_entry *ie = ev->de_ie; + struct active_inode *active = ie->ie_active; - oh->doh_readahead->dra_rc = ev->de_ev.ev_error; + active->readahead->dra_rc = ev->de_ev.ev_error; if (ev->de_ev.ev_error != 0) { - oh->doh_readahead->dra_rc = ev->de_ev.ev_error; + active->readahead->dra_rc = ev->de_ev.ev_error; daos_event_fini(&ev->de_ev); d_slab_release(ev->de_eqt->de_pre_read_slab, ev); - oh->doh_readahead->dra_ev = NULL; + active->readahead->dra_ev = NULL; } /* If the length is not as expected then the file has been modified since the last stat so @@ -550,15 +592,17 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) if (ev->de_len != ev->de_readahead_len) { daos_event_fini(&ev->de_ev); d_slab_release(ev->de_eqt->de_pre_read_slab, ev); - oh->doh_readahead->dra_ev = NULL; + 
 active->readahead->dra_ev = NULL;
 }
- D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock);
+ pre_read_mark_done(active);
+ /* Drop the extra ref on active, the file could be closed before this read completes */
+ active_ie_decref(dfuse_info, ie);
 }
 
 void
 dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh)
 {
+ struct active_inode *active = oh->doh_ie->ie_active;
 struct dfuse_eq *eqt;
 int rc;
 struct dfuse_event *ev;
@@ -572,18 +616,17 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh)
 ev->de_iov.iov_len = len;
 ev->de_req = 0;
 ev->de_sgl.sg_nr = 1;
- ev->de_oh = oh;
+ ev->de_ie = oh->doh_ie;
 ev->de_readahead_len = len;
 ev->de_req_position = 0;
+ ev->de_di = dfuse_info;
 
 ev->de_complete_cb = dfuse_cb_pre_read_complete;
- oh->doh_readahead->dra_ev = ev;
+ active->readahead->dra_ev = ev;
 
 rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, 0, &ev->de_len, &ev->de_ev);
- if (rc != 0) {
- D_GOTO(err, rc);
- return;
- }
+ if (rc != 0)
+ goto err;
 
 /* Send a message to the async thread to wake it up and poll for events */
 sem_post(&eqt->de_sem);
@@ -593,11 +636,12 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh)
 return;
 
err:
- oh->doh_readahead->dra_rc = rc;
+ active->readahead->dra_rc = rc;
 if (ev) {
 daos_event_fini(&ev->de_ev);
 d_slab_release(eqt->de_pre_read_slab, ev);
- oh->doh_readahead->dra_ev = NULL;
+ active->readahead->dra_ev = NULL;
 }
- D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock);
+ active_ie_decref(dfuse_info, oh->doh_ie);
+ pre_read_mark_done(active);
 }

From e644113eed2f2ddfbaf008e63d75c60d4f912d2d Mon Sep 17 00:00:00 2001
From: Di Wang
Date: Wed, 18 Dec 2024 19:31:46 +0000
Subject: [PATCH 05/26] DAOS-16686 dfuse: Fix overlapping chunk reads

From https://github.com/daos-stack/daos/pull/15298/

Handle concurrent reads in the chunk_read code. Rather than assuming
each slot only gets requested once, save the slot number as part of the
request and handle multiple requests.

This corrects the behaviour, avoids a crash when multiple readers read
the same file concurrently and improves performance in this case.

Required-githooks: true

Signed-off-by: Ashley Pittman
---
 src/client/dfuse/dfuse.h | 2 +-
 src/client/dfuse/file.c | 4 +-
 src/client/dfuse/ops/read.c | 318 +++++++++++++++++------------------
 3 files changed, 156 insertions(+), 168 deletions(-)

diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index 8e974c4c4ee..6221002e3e0 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -1133,7 +1133,7 @@ dfuse_cache_evict_dir(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *i
 * Returns true if feature was used.
 */
 bool
-read_chunk_close(struct dfuse_inode_entry *ie);
+read_chunk_close(struct active_inode *active);
 
 /* Metadata caching functions.
*/ diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index f18fbde0c56..6a86134d628 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -96,7 +96,7 @@ active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) if (oc != 1) goto out; - rcb = read_chunk_close(oh->doh_ie); + rcb = read_chunk_close(oh->doh_ie->ie_active); ah_free(dfuse_info, oh->doh_ie); out: @@ -118,7 +118,7 @@ active_ie_decref(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) if (oc != 1) goto out; - read_chunk_close(ie); + read_chunk_close(ie->ie_active); ah_free(dfuse_info, ie); out: diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 7dd0898101c..95ba4586e00 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -129,7 +129,7 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o } if (((position % K128) == 0) && ((len % K128) == 0)) { - DFUSE_TRA_INFO(oh, "allowing out-of-order pre read"); + DFUSE_TRA_DEBUG(oh, "allowing out-of-order pre read"); /* Do not closely track the read position in this case, just the maximum, * later checks will determine if the file is read to the end. */ @@ -173,38 +173,33 @@ pick_eqt(struct dfuse_info *dfuse_info) * * This code is entered when caching is enabled and reads are correctly size/aligned and not in the * last CHUNK_SIZE of a file. When open then the inode contains a single read_chunk_core pointer - * and this contains a list of read_chunk_data entries, one for each bucket. Buckets where all - * slots have been requested are remove from the list and closed when the last request is completed. + * and this contains a list of read_chunk_data entries, one for each bucket. * - * TODO: Currently there is no code to remove partially read buckets from the list so reading - * one slot every chunk would leave the entire file contents in memory until close and mean long - * list traversal times. + * TODO: Currently there is no code to remove buckets from the list so all buckets will remain in + * memory until close. */ #define CHUNK_SIZE (1024 * 1024) struct read_chunk_data { struct dfuse_event *ev; + bool slot_done[8]; struct active_inode *ia; - fuse_req_t reqs[8]; - struct dfuse_obj_hdl *ohs[8]; d_list_t list; uint64_t bucket; struct dfuse_eq *eqt; int rc; int entered; - ATOMIC int exited; - bool exiting; bool complete; + d_list_t req_list; }; -static void -chunk_free(struct read_chunk_data *cd) -{ - d_list_del(&cd->list); - d_slab_release(cd->eqt->de_read_slab, cd->ev); - D_FREE(cd); -} +struct read_chunk_req { + d_list_t req_list; + struct dfuse_obj_hdl *oh; + fuse_req_t req; + int slot; +}; /* Called when the last open file handle on a inode is closed. This needs to free everything which * is complete and for anything that isn't flag it for deletion in the callback. @@ -212,27 +207,20 @@ chunk_free(struct read_chunk_data *cd) * Returns true if the feature was used. 
*/ bool -read_chunk_close(struct dfuse_inode_entry *ie) +read_chunk_close(struct active_inode *active) { struct read_chunk_data *cd, *cdn; - bool rcb = false; - D_SPIN_LOCK(&ie->ie_active->lock); - if (d_list_empty(&ie->ie_active->chunks)) - goto out; - - rcb = true; + if (d_list_empty(&active->chunks)) + return false; - d_list_for_each_entry_safe(cd, cdn, &ie->ie_active->chunks, list) { - if (cd->complete) { - chunk_free(cd); - } else { - cd->exiting = true; - } + d_list_for_each_entry_safe(cd, cdn, &active->chunks, list) { + D_ASSERT(cd->complete); + d_list_del(&cd->list); + d_slab_release(cd->eqt->de_read_slab, cd->ev); + D_FREE(cd); } -out: - D_SPIN_UNLOCK(&ie->ie_active->lock); - return rcb; + return true; } static void @@ -240,119 +228,57 @@ chunk_cb(struct dfuse_event *ev) { struct read_chunk_data *cd = ev->de_cd; struct active_inode *ia = cd->ia; - fuse_req_t req; - bool done = false; + struct read_chunk_req *cr; + struct read_chunk_req *crn; cd->rc = ev->de_ev.ev_error; if (cd->rc == 0 && (ev->de_len != CHUNK_SIZE)) { - cd->rc = EIO; DS_WARN(cd->rc, "Unexpected short read bucket %ld (%#zx) expected %i got %zi", cd->bucket, cd->bucket * CHUNK_SIZE, CHUNK_SIZE, ev->de_len); } daos_event_fini(&ev->de_ev); - do { - int i; - req = 0; + /* Mark as complete so no more get put on list */ + D_SPIN_LOCK(&ia->lock); + cd->complete = true; - D_SPIN_LOCK(&ia->lock); - - if (cd->exiting) { - chunk_free(cd); - D_SPIN_UNLOCK(&ia->lock); - return; - } + /* Mark the slot as replied to. There's a race here as the slot hasn't been replied to + * however references are dropped by the DFUSE_REPLY macros below so an extra ref on active + * would be required. The danger is that the bucket gets put on the end of the list rather + * than the start. + */ + d_list_for_each_entry(cr, &cd->req_list, req_list) + cd->slot_done[cr->slot] = true; - cd->complete = true; - for (i = 0; i < 8; i++) { - if (cd->reqs[i]) { - req = cd->reqs[i]; - cd->reqs[i] = 0; - break; - } - } + D_SPIN_UNLOCK(&ia->lock); - D_SPIN_UNLOCK(&ia->lock); + d_list_for_each_entry_safe(cr, crn, &cd->req_list, req_list) { + size_t position = (cd->bucket * CHUNK_SIZE) + (cr->slot * K128); + size_t len; - if (req) { - size_t position = (cd->bucket * CHUNK_SIZE) + (i * K128); + DFUSE_TRA_DEBUG(cr->oh, "Replying for %ld[%d]", cd->bucket, cr->slot); - if (cd->rc != 0) { - DFUSE_REPLY_ERR_RAW(cd->ohs[i], req, cd->rc); - } else { - DFUSE_TRA_DEBUG(cd->ohs[i], "%#zx-%#zx read", position, - position + K128 - 1); - DFUSE_REPLY_BUFQ(cd->ohs[i], req, ev->de_iov.iov_buf + (i * K128), - K128); - } + /* Delete from the list before replying as there's no reference held otherwise */ + d_list_del(&cr->req_list); - if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7) - done = true; + if (cd->rc != 0) { + DFUSE_REPLY_ERR_RAW(cr->oh, cr->req, cd->rc); + } else { + if ((((cr->slot + 1) * K128) - 1) >= ev->de_len) + len = max(ev->de_len - (cr->slot * K128), 0); + else + len = K128; + + DFUSE_TRA_DEBUG(cr->oh, "%#zx-%#zx read", position, position + len - 1); + DFUSE_REPLY_BUFQ(cr->oh, cr->req, ev->de_iov.iov_buf + (cr->slot * K128), + len); } - } while (req && !done); - - if (done) { - d_slab_release(cd->eqt->de_read_slab, cd->ev); - D_FREE(cd); + D_FREE(cr); } } -/* Submut a read to dfs. - * - * Returns true on success. 
- */ -static bool -chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd, int slot) -{ - struct dfuse_info *dfuse_info = fuse_req_userdata(req); - struct dfuse_inode_entry *ie = oh->doh_ie; - struct dfuse_event *ev; - struct dfuse_eq *eqt; - int rc; - daos_off_t position = cd->bucket * CHUNK_SIZE; - - eqt = pick_eqt(dfuse_info); - - ev = d_slab_acquire(eqt->de_read_slab); - if (ev == NULL) { - cd->rc = ENOMEM; - return false; - } - - ev->de_iov.iov_len = CHUNK_SIZE; - ev->de_req = req; - ev->de_cd = cd; - ev->de_sgl.sg_nr = 1; - ev->de_len = 0; - ev->de_complete_cb = chunk_cb; - - cd->ev = ev; - cd->eqt = eqt; - cd->reqs[slot] = req; - cd->ohs[slot] = oh; - - rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, - &ev->de_ev); - if (rc != 0) - goto err; - - /* Send a message to the async thread to wake it up and poll for events */ - sem_post(&eqt->de_sem); - - /* Now ensure there are more descriptors for the next request */ - d_slab_restock(eqt->de_read_slab); - - return true; - -err: - daos_event_fini(&ev->de_ev); - d_slab_release(eqt->de_read_slab, ev); - cd->rc = rc; - return false; -} - /* Try and do a bulk read. * * Returns true if it was able to handle the read. @@ -362,11 +288,13 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) { struct dfuse_inode_entry *ie = oh->doh_ie; struct read_chunk_data *cd; + struct read_chunk_req *cr = NULL; off_t last; uint64_t bucket; int slot; bool submit = false; bool rcb; + bool all_done = true; if (len != K128) return false; @@ -391,7 +319,6 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) d_list_for_each_entry(cd, &ie->ie_active->chunks, list) if (cd->bucket == bucket) { - /* Remove from list to re-add again later. */ d_list_del(&cd->list); goto found; } @@ -400,57 +327,121 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) if (cd == NULL) goto err; + D_ALLOC_PTR(cr); + if (cr == NULL) { + D_FREE(cd); + goto err; + } + + D_INIT_LIST_HEAD(&cd->req_list); cd->ia = ie->ie_active; cd->bucket = bucket; submit = true; found: - if (++cd->entered < 8) { - /* Put on front of list for efficient searching */ - d_list_add(&cd->list, &ie->ie_active->chunks); + for (int i = 0; i < 8; i++) { + if (!cd->slot_done[i]) + all_done = false; } - D_SPIN_UNLOCK(&ie->ie_active->lock); + if (all_done) + d_list_add(&cd->list, &ie->ie_active->chunks); + else + d_list_add_tail(&cd->list, &ie->ie_active->chunks); if (submit) { - DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot); - rcb = chunk_fetch(req, oh, cd, slot); - } else { - struct dfuse_event *ev = NULL; + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_eq *eqt; + struct dfuse_event *ev; + int rc; + + /* Overwrite position here to the start of the bucket */ + position = cd->bucket * CHUNK_SIZE; + + eqt = pick_eqt(dfuse_info); + + ev = d_slab_acquire(eqt->de_read_slab); + if (ev == NULL) { + d_list_del(&cd->list); + D_FREE(cr); + D_FREE(cd); + goto err; + } + + d_list_add(&cr->req_list, &cd->req_list); /* Now check if this read request is complete or not yet, if it isn't then just * save req in the right slot however if it is then reply here. After the call to * DFUSE_REPLY_* then no reference is held on either the open file or the inode so * at that point they could be closed. 
*/ - rcb = true; - D_SPIN_LOCK(&ie->ie_active->lock); - if (cd->complete) { - ev = cd->ev; + DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot); + D_SPIN_UNLOCK(&ie->ie_active->lock); + + cd->eqt = eqt; + cd->ev = ev; + + cr->req = req; + cr->oh = oh; + cr->slot = slot; + + ev->de_iov.iov_len = CHUNK_SIZE; + ev->de_req = req; + ev->de_cd = cd; + ev->de_sgl.sg_nr = 1; + ev->de_len = 0; + ev->de_complete_cb = chunk_cb; + + rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, + &ev->de_ev); + if (rc == 0) { + /* Send a message to the async thread to wake it up and poll for events */ + sem_post(&eqt->de_sem); } else { - cd->reqs[slot] = req; - cd->ohs[slot] = oh; + ev->de_ev.ev_error = rc; + chunk_cb(ev); } + + rcb = true; + } else if (cd->complete) { + cd->slot_done[slot] = true; + D_SPIN_UNLOCK(&ie->ie_active->lock); - if (ev) { - if (cd->rc != 0) { - /* Don't pass fuse an error here, rather return false and the read - * will be tried over the network. - */ - rcb = false; - } else { - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, - position + K128 - 1); - DFUSE_REPLY_BUFQ(oh, req, ev->de_iov.iov_buf + (slot * K128), K128); - } - if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7) { - d_slab_release(cd->eqt->de_read_slab, cd->ev); - D_FREE(cd); - } + if (cd->rc != 0) { + /* Don't pass fuse an error here, rather return false and + * the read will be tried over the network. + */ + rcb = false; + } else { + size_t read_len; + + if ((((slot + 1) * K128) - 1) >= cd->ev->de_len) + read_len = max(cd->ev->de_len - (slot * K128), 0); + else + read_len = K128; + + oh->doh_linear_read_pos = max(oh->doh_linear_read_pos, position + read_len); + + DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, position + read_len - 1); + DFUSE_REPLY_BUFQ(oh, req, cd->ev->de_iov.iov_buf + (slot * K128), read_len); + rcb = true; + } + } else { + rcb = false; + + D_ALLOC_PTR(cr); + if (cr) { + cr->req = req; + cr->oh = oh; + cr->slot = slot; + d_list_add_tail(&cr->req_list, &cd->req_list); + rcb = true; } + + D_SPIN_UNLOCK(&ie->ie_active->lock); } return rcb; @@ -493,8 +484,10 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct eqt = pick_eqt(dfuse_info); ev = d_slab_acquire(eqt->de_read_slab); - if (ev == NULL) - D_GOTO(err, rc = ENOMEM); + if (ev == NULL) { + DFUSE_REPLY_ERR_RAW(oh, req, ENOMEM); + return; + } if (oh->doh_ie->ie_truncated && position + len < oh->doh_ie->ie_stat.st_size && ((oh->doh_ie->ie_start_off == 0 && oh->doh_ie->ie_end_off == 0) || @@ -533,7 +526,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { - D_GOTO(err, rc); + ev->de_ev.ev_error = rc; + dfuse_cb_read_complete(ev); return; } @@ -544,12 +538,6 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct d_slab_restock(eqt->de_read_slab); return; -err: - DFUSE_REPLY_ERR_RAW(oh, req, rc); - if (ev) { - daos_event_fini(&ev->de_ev); - d_slab_release(eqt->de_read_slab, ev); - } } static void From fd27521220dd41005428d449cbe1cfe04303431d Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:34:43 +0000 Subject: [PATCH 06/26] DAOS-16686 dfuse: Detect matching reads to avoid network access From https://github.com/daos-stack/daos/pull/15528 If a read matches a current outstanding read then simply connect the two and when there's a reply from the network then respond to both requests. 
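
In outline (a condensed sketch of the read.c hunk below, with locking and error
handling elided), each incoming read event is matched against the list of reads
already in flight on the inode, and a match simply chains the new event off the
existing one:

    /* Under active->lock in dfuse_cb_read() */
    d_list_for_each_entry(evc, &active->open_reads, de_read_list) {
        if (ev->de_req_position == evc->de_req_position &&
            ev->de_req_len <= evc->de_req_len) {
            /* A matching read is already in flight: chain this event off
             * it, the completion callback will reply to both requests.
             */
            d_list_add(&ev->de_read_list, &evc->de_read_slaves);
            return;
        }
    }
    /* No match: this read leads, later reads may chain off it */
    d_list_add_tail(&ev->de_read_list, &active->open_reads);
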
Ashley Pittman Required-githooks: true --- src/client/dfuse/dfuse.h | 10 +- src/client/dfuse/dfuse_core.c | 2 + src/client/dfuse/file.c | 5 +- src/client/dfuse/ops/read.c | 379 +++++++++++++++++++--------------- 4 files changed, 229 insertions(+), 167 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 6221002e3e0..80ce2d7f844 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -399,6 +399,13 @@ struct dfuse_event { d_iov_t de_iov; d_sg_list_t de_sgl; d_list_t de_list; + + /* Position in a list of events, this will either be off active->open_reads or + * de->de_read_slaves. + */ + d_list_t de_read_list; + /* List of slave events */ + d_list_t de_read_slaves; struct dfuse_eq *de_eqt; union { struct dfuse_obj_hdl *de_oh; @@ -1017,6 +1024,7 @@ struct dfuse_inode_entry { struct active_inode { d_list_t chunks; + d_list_t open_reads; pthread_spinlock_t lock; struct dfuse_pre_read *readahead; }; @@ -1133,7 +1141,7 @@ dfuse_cache_evict_dir(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *i * Returns true if feature was used. */ bool -read_chunk_close(struct active_inode *active); +read_chunk_close(struct dfuse_inode_entry *ie); /* Metadata caching functions. */ diff --git a/src/client/dfuse/dfuse_core.c b/src/client/dfuse/dfuse_core.c index e69b598efb7..e0bc4ea3c3f 100644 --- a/src/client/dfuse/dfuse_core.c +++ b/src/client/dfuse/dfuse_core.c @@ -1318,6 +1318,8 @@ dfuse_read_event_size(void *arg, size_t size) ev->de_sgl.sg_nr = 1; } + D_INIT_LIST_HEAD(&ev->de_read_slaves); + rc = daos_event_init(&ev->de_ev, ev->de_eqt->de_eq, NULL); if (rc != -DER_SUCCESS) { return false; diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index 6a86134d628..028685340d7 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -41,6 +41,7 @@ active_ie_init(struct dfuse_inode_entry *ie, bool *preread) goto out; } D_INIT_LIST_HEAD(&ie->ie_active->chunks); + D_INIT_LIST_HEAD(&ie->ie_active->open_reads); if (preread && *preread) { D_ALLOC_PTR(ie->ie_active->readahead); if (ie->ie_active->readahead) { @@ -96,7 +97,7 @@ active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) if (oc != 1) goto out; - rcb = read_chunk_close(oh->doh_ie->ie_active); + rcb = read_chunk_close(oh->doh_ie); ah_free(dfuse_info, oh->doh_ie); out: @@ -118,7 +119,7 @@ active_ie_decref(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) if (oc != 1) goto out; - read_chunk_close(ie->ie_active); + read_chunk_close(ie); ah_free(dfuse_info, ie); out: diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 95ba4586e00..05f7d9bb9ef 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -8,7 +8,7 @@ #include "dfuse.h" static void -dfuse_cb_read_complete(struct dfuse_event *ev) +cb_read_helper(struct dfuse_event *ev, void *buff) { struct dfuse_obj_hdl *oh = ev->de_oh; @@ -22,32 +22,54 @@ dfuse_cb_read_complete(struct dfuse_event *ev) oh->doh_linear_read = false; } else { oh->doh_linear_read_pos = ev->de_req_position + ev->de_len; - if (ev->de_len < ev->de_req_len) { + if (ev->de_len < ev->de_req_len) oh->doh_linear_read_eof = true; - } } } - if (ev->de_len == 0) { - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested (EOF)", ev->de_req_position, - ev->de_req_position + ev->de_req_len - 1); - - DFUSE_REPLY_BUFQ(oh, ev->de_req, ev->de_iov.iov_buf, ev->de_len); - D_GOTO(release, 0); - } - - if (ev->de_len == ev->de_req_len) + if (ev->de_len == ev->de_req_len) { DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", ev->de_req_position, 
ev->de_req_position + ev->de_req_len - 1); - else - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read %#zx-%#zx not read (truncated)", - ev->de_req_position, ev->de_req_position + ev->de_len - 1, - ev->de_req_position + ev->de_len, - ev->de_req_position + ev->de_req_len - 1); + } else { + if (ev->de_len == 0) + DFUSE_TRA_DEBUG(oh, "%#zx-%#zx requested (EOF)", ev->de_req_position, + ev->de_req_position + ev->de_req_len - 1); + else + DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read %#zx-%#zx not read (truncated)", + ev->de_req_position, ev->de_req_position + ev->de_len - 1, + ev->de_req_position + ev->de_len, + ev->de_req_position + ev->de_req_len - 1); + } - DFUSE_REPLY_BUFQ(oh, ev->de_req, ev->de_iov.iov_buf, ev->de_len); + DFUSE_REPLY_BUFQ(oh, ev->de_req, buff, ev->de_len); release: daos_event_fini(&ev->de_ev); +} + +static void +dfuse_cb_read_complete(struct dfuse_event *ev) +{ + struct dfuse_event *evs, *evn; + + D_SPIN_LOCK(&ev->de_oh->doh_ie->ie_active->lock); + d_list_del(&ev->de_read_list); + D_SPIN_UNLOCK(&ev->de_oh->doh_ie->ie_active->lock); + + d_list_for_each_entry(evs, &ev->de_read_slaves, de_read_list) { + DFUSE_TRA_DEBUG(ev->de_oh, "concurrent network read %p", evs->de_oh); + evs->de_len = min(ev->de_len, evs->de_req_len); + evs->de_ev.ev_error = ev->de_ev.ev_error; + cb_read_helper(evs, ev->de_iov.iov_buf); + } + + cb_read_helper(ev, ev->de_iov.iov_buf); + + d_list_for_each_entry_safe(evs, evn, &ev->de_read_slaves, de_read_list) { + d_list_del(&evs->de_read_list); + d_slab_restock(evs->de_eqt->de_read_slab); + d_slab_release(evs->de_eqt->de_read_slab, evs); + } + d_slab_restock(ev->de_eqt->de_read_slab); d_slab_release(ev->de_eqt->de_read_slab, ev); } @@ -173,33 +195,38 @@ pick_eqt(struct dfuse_info *dfuse_info) * * This code is entered when caching is enabled and reads are correctly size/aligned and not in the * last CHUNK_SIZE of a file. When open then the inode contains a single read_chunk_core pointer - * and this contains a list of read_chunk_data entries, one for each bucket. + * and this contains a list of read_chunk_data entries, one for each bucket. Buckets where all + * slots have been requested are remove from the list and closed when the last request is completed. * - * TODO: Currently there is no code to remove buckets from the list so all buckets will remain in - * memory until close. + * TODO: Currently there is no code to remove partially read buckets from the list so reading + * one slot every chunk would leave the entire file contents in memory until close and mean long + * list traversal times. */ #define CHUNK_SIZE (1024 * 1024) struct read_chunk_data { struct dfuse_event *ev; - bool slot_done[8]; struct active_inode *ia; + fuse_req_t reqs[8]; + struct dfuse_obj_hdl *ohs[8]; d_list_t list; uint64_t bucket; struct dfuse_eq *eqt; int rc; int entered; + ATOMIC int exited; + bool exiting; bool complete; - d_list_t req_list; }; -struct read_chunk_req { - d_list_t req_list; - struct dfuse_obj_hdl *oh; - fuse_req_t req; - int slot; -}; +static void +chunk_free(struct read_chunk_data *cd) +{ + d_list_del(&cd->list); + d_slab_release(cd->eqt->de_read_slab, cd->ev); + D_FREE(cd); +} /* Called when the last open file handle on a inode is closed. This needs to free everything which * is complete and for anything that isn't flag it for deletion in the callback. @@ -207,20 +234,27 @@ struct read_chunk_req { * Returns true if the feature was used. 
*/ bool -read_chunk_close(struct active_inode *active) +read_chunk_close(struct dfuse_inode_entry *ie) { struct read_chunk_data *cd, *cdn; + bool rcb = false; - if (d_list_empty(&active->chunks)) - return false; + D_SPIN_LOCK(&ie->ie_active->lock); + if (d_list_empty(&ie->ie_active->chunks)) + goto out; - d_list_for_each_entry_safe(cd, cdn, &active->chunks, list) { - D_ASSERT(cd->complete); - d_list_del(&cd->list); - d_slab_release(cd->eqt->de_read_slab, cd->ev); - D_FREE(cd); + rcb = true; + + d_list_for_each_entry_safe(cd, cdn, &ie->ie_active->chunks, list) { + if (cd->complete) { + chunk_free(cd); + } else { + cd->exiting = true; + } } - return true; +out: + D_SPIN_UNLOCK(&ie->ie_active->lock); + return rcb; } static void @@ -228,57 +262,119 @@ chunk_cb(struct dfuse_event *ev) { struct read_chunk_data *cd = ev->de_cd; struct active_inode *ia = cd->ia; - struct read_chunk_req *cr; - struct read_chunk_req *crn; + fuse_req_t req; + bool done = false; cd->rc = ev->de_ev.ev_error; if (cd->rc == 0 && (ev->de_len != CHUNK_SIZE)) { + cd->rc = EIO; DS_WARN(cd->rc, "Unexpected short read bucket %ld (%#zx) expected %i got %zi", cd->bucket, cd->bucket * CHUNK_SIZE, CHUNK_SIZE, ev->de_len); } daos_event_fini(&ev->de_ev); - /* Mark as complete so no more get put on list */ - D_SPIN_LOCK(&ia->lock); - cd->complete = true; + do { + int i; + req = 0; - /* Mark the slot as replied to. There's a race here as the slot hasn't been replied to - * however references are dropped by the DFUSE_REPLY macros below so an extra ref on active - * would be required. The danger is that the bucket gets put on the end of the list rather - * than the start. - */ - d_list_for_each_entry(cr, &cd->req_list, req_list) - cd->slot_done[cr->slot] = true; + D_SPIN_LOCK(&ia->lock); - D_SPIN_UNLOCK(&ia->lock); + if (cd->exiting) { + chunk_free(cd); + D_SPIN_UNLOCK(&ia->lock); + return; + } - d_list_for_each_entry_safe(cr, crn, &cd->req_list, req_list) { - size_t position = (cd->bucket * CHUNK_SIZE) + (cr->slot * K128); - size_t len; + cd->complete = true; + for (i = 0; i < 8; i++) { + if (cd->reqs[i]) { + req = cd->reqs[i]; + cd->reqs[i] = 0; + break; + } + } - DFUSE_TRA_DEBUG(cr->oh, "Replying for %ld[%d]", cd->bucket, cr->slot); + D_SPIN_UNLOCK(&ia->lock); - /* Delete from the list before replying as there's no reference held otherwise */ - d_list_del(&cr->req_list); + if (req) { + size_t position = (cd->bucket * CHUNK_SIZE) + (i * K128); - if (cd->rc != 0) { - DFUSE_REPLY_ERR_RAW(cr->oh, cr->req, cd->rc); - } else { - if ((((cr->slot + 1) * K128) - 1) >= ev->de_len) - len = max(ev->de_len - (cr->slot * K128), 0); - else - len = K128; - - DFUSE_TRA_DEBUG(cr->oh, "%#zx-%#zx read", position, position + len - 1); - DFUSE_REPLY_BUFQ(cr->oh, cr->req, ev->de_iov.iov_buf + (cr->slot * K128), - len); + if (cd->rc != 0) { + DFUSE_REPLY_ERR_RAW(cd->ohs[i], req, cd->rc); + } else { + DFUSE_TRA_DEBUG(cd->ohs[i], "%#zx-%#zx read", position, + position + K128 - 1); + DFUSE_REPLY_BUFQ(cd->ohs[i], req, ev->de_iov.iov_buf + (i * K128), + K128); + } + + if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7) + done = true; } - D_FREE(cr); + } while (req && !done); + + if (done) { + d_slab_release(cd->eqt->de_read_slab, cd->ev); + D_FREE(cd); } } +/* Submut a read to dfs. + * + * Returns true on success. 
+ */ +static bool +chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd, int slot) +{ + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_inode_entry *ie = oh->doh_ie; + struct dfuse_event *ev; + struct dfuse_eq *eqt; + int rc; + daos_off_t position = cd->bucket * CHUNK_SIZE; + + eqt = pick_eqt(dfuse_info); + + ev = d_slab_acquire(eqt->de_read_slab); + if (ev == NULL) { + cd->rc = ENOMEM; + return false; + } + + ev->de_iov.iov_len = CHUNK_SIZE; + ev->de_req = req; + ev->de_cd = cd; + ev->de_sgl.sg_nr = 1; + ev->de_len = 0; + ev->de_complete_cb = chunk_cb; + + cd->ev = ev; + cd->eqt = eqt; + cd->reqs[slot] = req; + cd->ohs[slot] = oh; + + rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, + &ev->de_ev); + if (rc != 0) + goto err; + + /* Send a message to the async thread to wake it up and poll for events */ + sem_post(&eqt->de_sem); + + /* Now ensure there are more descriptors for the next request */ + d_slab_restock(eqt->de_read_slab); + + return true; + +err: + daos_event_fini(&ev->de_ev); + d_slab_release(eqt->de_read_slab, ev); + cd->rc = rc; + return false; +} + /* Try and do a bulk read. * * Returns true if it was able to handle the read. @@ -288,13 +384,11 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) { struct dfuse_inode_entry *ie = oh->doh_ie; struct read_chunk_data *cd; - struct read_chunk_req *cr = NULL; off_t last; uint64_t bucket; int slot; bool submit = false; bool rcb; - bool all_done = true; if (len != K128) return false; @@ -319,6 +413,7 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) d_list_for_each_entry(cd, &ie->ie_active->chunks, list) if (cd->bucket == bucket) { + /* Remove from list to re-add again later. */ d_list_del(&cd->list); goto found; } @@ -327,121 +422,57 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) if (cd == NULL) goto err; - D_ALLOC_PTR(cr); - if (cr == NULL) { - D_FREE(cd); - goto err; - } - - D_INIT_LIST_HEAD(&cd->req_list); cd->ia = ie->ie_active; cd->bucket = bucket; submit = true; found: - for (int i = 0; i < 8; i++) { - if (!cd->slot_done[i]) - all_done = false; + if (++cd->entered < 8) { + /* Put on front of list for efficient searching */ + d_list_add(&cd->list, &ie->ie_active->chunks); } - if (all_done) - d_list_add(&cd->list, &ie->ie_active->chunks); - else - d_list_add_tail(&cd->list, &ie->ie_active->chunks); + D_SPIN_UNLOCK(&ie->ie_active->lock); if (submit) { - struct dfuse_info *dfuse_info = fuse_req_userdata(req); - struct dfuse_eq *eqt; - struct dfuse_event *ev; - int rc; - - /* Overwrite position here to the start of the bucket */ - position = cd->bucket * CHUNK_SIZE; - - eqt = pick_eqt(dfuse_info); - - ev = d_slab_acquire(eqt->de_read_slab); - if (ev == NULL) { - d_list_del(&cd->list); - D_FREE(cr); - D_FREE(cd); - goto err; - } - - d_list_add(&cr->req_list, &cd->req_list); + DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot); + rcb = chunk_fetch(req, oh, cd, slot); + } else { + struct dfuse_event *ev = NULL; /* Now check if this read request is complete or not yet, if it isn't then just * save req in the right slot however if it is then reply here. After the call to * DFUSE_REPLY_* then no reference is held on either the open file or the inode so * at that point they could be closed. 
*/ + rcb = true; - DFUSE_TRA_DEBUG(oh, "submit for bucket %ld[%d]", bucket, slot); - D_SPIN_UNLOCK(&ie->ie_active->lock); - - cd->eqt = eqt; - cd->ev = ev; - - cr->req = req; - cr->oh = oh; - cr->slot = slot; - - ev->de_iov.iov_len = CHUNK_SIZE; - ev->de_req = req; - ev->de_cd = cd; - ev->de_sgl.sg_nr = 1; - ev->de_len = 0; - ev->de_complete_cb = chunk_cb; - - rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, - &ev->de_ev); - if (rc == 0) { - /* Send a message to the async thread to wake it up and poll for events */ - sem_post(&eqt->de_sem); + D_SPIN_LOCK(&ie->ie_active->lock); + if (cd->complete) { + ev = cd->ev; } else { - ev->de_ev.ev_error = rc; - chunk_cb(ev); + cd->reqs[slot] = req; + cd->ohs[slot] = oh; } - - rcb = true; - } else if (cd->complete) { - cd->slot_done[slot] = true; - D_SPIN_UNLOCK(&ie->ie_active->lock); - if (cd->rc != 0) { - /* Don't pass fuse an error here, rather return false and - * the read will be tried over the network. - */ - rcb = false; - } else { - size_t read_len; - - if ((((slot + 1) * K128) - 1) >= cd->ev->de_len) - read_len = max(cd->ev->de_len - (slot * K128), 0); - else - read_len = K128; - - oh->doh_linear_read_pos = max(oh->doh_linear_read_pos, position + read_len); - - DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, position + read_len - 1); - DFUSE_REPLY_BUFQ(oh, req, cd->ev->de_iov.iov_buf + (slot * K128), read_len); - rcb = true; - } - } else { - rcb = false; - - D_ALLOC_PTR(cr); - if (cr) { - cr->req = req; - cr->oh = oh; - cr->slot = slot; - d_list_add_tail(&cr->req_list, &cd->req_list); - rcb = true; + if (ev) { + if (cd->rc != 0) { + /* Don't pass fuse an error here, rather return false and the read + * will be tried over the network. + */ + rcb = false; + } else { + DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, + position + K128 - 1); + DFUSE_REPLY_BUFQ(oh, req, ev->de_iov.iov_buf + (slot * K128), K128); + } + if (atomic_fetch_add_relaxed(&cd->exited, 1) == 7) { + d_slab_release(cd->eqt->de_read_slab, cd->ev); + D_FREE(cd); + } } - - D_SPIN_UNLOCK(&ie->ie_active->lock); } return rcb; @@ -524,6 +555,26 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct DFUSE_IE_WFLUSH(oh->doh_ie); + /* Check for open matching reads, if there are multiple readers of the same file offset + * then chain future requests off the first one to avoid extra network round-trips. This + * can and does happen even with caching enabled if there are multiple client processes. + */ + D_SPIN_LOCK(&active->lock); + { + struct dfuse_event *evc; + + d_list_for_each_entry(evc, &active->open_reads, de_read_list) { + if (ev->de_req_position == evc->de_req_position && + ev->de_req_len <= evc->de_req_len) { + d_list_add(&ev->de_read_list, &evc->de_read_slaves); + D_SPIN_UNLOCK(&active->lock); + return; + } + } + d_list_add_tail(&ev->de_read_list, &active->open_reads); + } + D_SPIN_UNLOCK(&active->lock); + rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { ev->de_ev.ev_error = rc; From ae9b7864d95439d88a47f901196403c3affc91f5 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:38:43 +0000 Subject: [PATCH 07/26] DAOS-15626 dfuse: Improve linear-read detection code Fix a bug where linear read was not correctly saved to the directory. Improve the NLT testing of pre_read to not just invoke it but to use the statistics to verify correct operation. 
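
The heart of the fix is the release path; roughly (a simplified sketch
mirroring the file.c hunk below, locking not shown):

    /* Last handle close in active_oh_decref() */
    if (read_chunk_close(oh->doh_ie))
        oh->doh_linear_read = true;

    /* Only apply the linear-read setting to the parent directory when the
     * handle actually saw reads; a plain open/close (or fully cached use)
     * leaves the existing setting unchanged.
     */
    if (oh->doh_linear_read) {
        if (oh->doh_ie->ie_active->read_count != 0)
            oh->doh_set_linear_read = true;
    } else {
        oh->doh_set_linear_read = true;
    }
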
Required-githooks: true

Signed-off-by: Ashley Pittman
---
 src/client/dfuse/dfuse.h    | 22 ++++++++++-----
 src/client/dfuse/file.c     | 20 +++++++++++---
 src/client/dfuse/ops/open.c | 23 ++++------------
 src/client/dfuse/ops/read.c | 21 ++++++++------
 utils/node_local_test.py    | 55 ++++++++++++++++++++++++++++++-------
 5 files changed, 94 insertions(+), 47 deletions(-)

diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h
index 80ce2d7f844..1582f17329c 100644
--- a/src/client/dfuse/dfuse.h
+++ b/src/client/dfuse/dfuse.h
@@ -167,17 +167,24 @@ struct dfuse_obj_hdl {
 	/* Pointer to the last returned drc entry */
 	struct dfuse_readdir_c *doh_rd_nextc;

-	/* Linear read function, if a file is read from start to end then this normally requires
-	 * a final read request at the end of the file that returns zero bytes. Detect this case
-	 * and when the final read is detected then just return without a round trip.
-	 * Store a flag for this being enabled (starts as true, but many I/O patterns will set it
-	 * to false), the expected position of the next read and a boolean for if EOF has been
-	 * detected.
+	/* Linear read tracking. If a file is opened and read from start to finish then this is
+	 * called a linear read. Linear reads, however, may or may not read EOF at the end of the
+	 * file, as the reader may be checking the file size.
+	 *
+	 * Detect this case and track it at the file handle level; this is then used in two places:
+	 * for a read of EOF it means the round trip can be avoided, and on release the flag can be
+	 * used to apply a setting to the directory inode.
+	 *
+	 * This flag starts enabled and many I/O patterns will disable it. We also store the next
+	 * expected read position and whether EOF has been reached.
	 */
+	off_t doh_linear_read_pos;
 	bool doh_linear_read;
 	bool doh_linear_read_eof;
+	bool doh_set_linear_read;
+
	/** True if caching is enabled for this file.
*/ bool doh_caching; @@ -1026,6 +1033,7 @@ struct active_inode { d_list_t chunks; d_list_t open_reads; pthread_spinlock_t lock; + ATOMIC uint64_t read_count; struct dfuse_pre_read *readahead; }; @@ -1034,7 +1042,7 @@ int active_ie_init(struct dfuse_inode_entry *ie, bool *preread); /* Mark a oh as closing and drop the ref on inode active */ -bool +void active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh); /* Decrease active count on inode, called on error where there is no oh */ diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index 028685340d7..e612d759b54 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -42,6 +42,7 @@ active_ie_init(struct dfuse_inode_entry *ie, bool *preread) } D_INIT_LIST_HEAD(&ie->ie_active->chunks); D_INIT_LIST_HEAD(&ie->ie_active->open_reads); + atomic_init(&ie->ie_active->read_count, 0); if (preread && *preread) { D_ALLOC_PTR(ie->ie_active->readahead); if (ie->ie_active->readahead) { @@ -81,11 +82,10 @@ ah_free(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) dfuse_inode_decref(dfuse_info, ie); } -bool +void active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) { uint32_t oc; - bool rcb = true; D_MUTEX_LOCK(&alock); @@ -94,15 +94,27 @@ active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) DFUSE_TRA_DEBUG(oh->doh_ie, "Decref to %d", oc - 1); + /* Leave set_linear_read as false in this case */ if (oc != 1) goto out; - rcb = read_chunk_close(oh->doh_ie); + if (read_chunk_close(oh->doh_ie)) + oh->doh_linear_read = true; + + /* Do not set linear read in the case where there's no reads or writes, this could be + * simple open/close calls but it could also be cache use so leave the setting unchanged + * in this case. + */ + if (oh->doh_linear_read) { + if (oh->doh_ie->ie_active->read_count != 0) + oh->doh_set_linear_read = true; + } else { + oh->doh_set_linear_read = true; + } ah_free(dfuse_info, oh->doh_ie); out: D_MUTEX_UNLOCK(&alock); - return rcb; } void diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 2cb6a12a2bf..ab8e1b3988d 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -198,8 +198,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) atomic_fetch_add_relaxed(&ie->ie_ref, 1); } - if (active_oh_decref(dfuse_info, oh)) - oh->doh_linear_read = true; + active_oh_decref(dfuse_info, oh); rc = dfs_release(oh->doh_obj); if (rc == 0) { @@ -209,25 +208,13 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) oh->doh_ie = NULL; } if (oh->doh_parent_dir) { - bool use_linear_read = false; - bool set_linear_read = true; - - if (oh->doh_linear_read) { - /* If the file was not read from then this could indicate a cached read - * so do not disable pre-read for the directory. 
- */ - if (!oh->doh_linear_read_eof) - set_linear_read = false; - use_linear_read = true; - } - - if (set_linear_read) { + if (oh->doh_set_linear_read) { DFUSE_TRA_DEBUG(oh->doh_parent_dir, "Setting linear_read to %d", - use_linear_read); + oh->doh_linear_read); - atomic_store_relaxed(&oh->doh_parent_dir->ie_linear_read, use_linear_read); + atomic_store_relaxed(&oh->doh_parent_dir->ie_linear_read, + oh->doh_linear_read); } - dfuse_inode_decref(dfuse_info, oh->doh_parent_dir); } if (ie) { diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 05f7d9bb9ef..0ea49f0e089 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -229,7 +229,8 @@ chunk_free(struct read_chunk_data *cd) } /* Called when the last open file handle on a inode is closed. This needs to free everything which - * is complete and for anything that isn't flag it for deletion in the callback. + * is complete and for anything that isn't flag it for deletion in the callback. No locking is + * needed here as this is only called as active is being released. * * Returns true if the feature was used. */ @@ -495,10 +496,11 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct DFUSE_IE_STAT_ADD(oh->doh_ie, DS_READ); + atomic_fetch_add_relaxed(&oh->doh_ie->ie_active->read_count, 1); + if (oh->doh_linear_read_eof && position == oh->doh_linear_read_pos) { DFUSE_TRA_DEBUG(oh, "Returning EOF early without round trip %#zx", position); oh->doh_linear_read_eof = false; - oh->doh_linear_read = false; if (active->readahead) DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); @@ -515,10 +517,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct eqt = pick_eqt(dfuse_info); ev = d_slab_acquire(eqt->de_read_slab); - if (ev == NULL) { - DFUSE_REPLY_ERR_RAW(oh, req, ENOMEM); - return; - } + if (ev == NULL) + D_GOTO(err, rc = ENOMEM); if (oh->doh_ie->ie_truncated && position + len < oh->doh_ie->ie_stat.st_size && ((oh->doh_ie->ie_start_off == 0 && oh->doh_ie->ie_end_off == 0) || @@ -577,8 +577,7 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { - ev->de_ev.ev_error = rc; - dfuse_cb_read_complete(ev); + D_GOTO(err, rc); return; } @@ -589,6 +588,12 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct d_slab_restock(eqt->de_read_slab); return; +err: + DFUSE_REPLY_ERR_RAW(oh, req, rc); + if (ev) { + daos_event_fini(&ev->de_ev); + d_slab_release(eqt->de_read_slab, ev); + } } static void diff --git a/utils/node_local_test.py b/utils/node_local_test.py index f29cd3f7ed7..950c1a6dcf5 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -1560,17 +1560,17 @@ def run_query(self, use_json=False, quiet=False): return rc def check_usage(self, ino=None, inodes=None, open_files=None, pools=None, containers=None, - qpath=None): + qpath=None, old=None): """Query and verify the dfuse statistics. + Optionally verify numbers are as expected/defined or return a delta to a previous sample. Returns the raw numbers in a dict. 
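+
+        Delta usage, as exercised by test_pre_read below (illustrative):
+            res = dfuse.check_usage()
+            # ... perform I/O through the mount ...
+            res = dfuse.check_usage(old=res)  # 'statistics' values are now deltas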
""" cmd = ['filesystem', 'query', qpath or self.dir] if ino is not None: cmd.extend(['--inode', str(ino)]) - rc = run_daos_cmd(self.conf, cmd, use_json=True) - print(rc) + rc = run_daos_cmd(self.conf, cmd, use_json=True, valgrind=False, log_check=False) assert rc.returncode == 0, rc if inodes: @@ -1581,6 +1581,16 @@ def check_usage(self, ino=None, inodes=None, open_files=None, pools=None, contai assert rc.json['response']['pools'] == pools, rc if containers: assert rc.json['response']['containers'] == containers, rc + + # If a prior version of the statistics was supplied then take a delta, but take a delta of + # the prior version as it came from daos, not as it was reported. + if 'statistics' in rc.json['response']: + rc.json['response']['raw'] = copy.deepcopy(rc.json['response']['statistics']) + if old: + for key in rc.json['response']['statistics']: + rc.json['response']['statistics'][key] -= old['raw'].get(key, 0) + + print(f"Usage: {rc.json['response']}") return rc.json['response'] def _evict_path(self, path): @@ -1604,7 +1614,7 @@ def evict_and_wait(self, paths, qpath=None): for inode in inodes: found = True while found: - rc = self.check_usage(inode, qpath=qpath) + rc = self.check_usage(ino=inode, qpath=qpath) print(rc) found = rc['resident'] if not found: @@ -2312,7 +2322,7 @@ def test_read(self): def test_pre_read(self): """Test the pre-read code. - Test reading a file which is previously unknown to fuse with caching on. This should go + Test reading a files which are previously unknown to fuse with caching on. This should go into the pre_read code and load the file contents automatically after the open call. """ dfuse = DFuse(self.server, self.conf, container=self.container) @@ -2341,32 +2351,57 @@ def test_pre_read(self): dfuse = DFuse(self.server, self.conf, caching=True, container=self.container) dfuse.start(v_hint='pre_read_1') + # Open a file and read in one go. with open(join(dfuse.dir, 'file0'), 'r') as fd: data0 = fd.read() + res = dfuse.check_usage() + assert res['statistics']['pre_read'] == 1, res + # Open a file and read in one go. with open(join(dfuse.dir, 'file1'), 'r') as fd: data1 = fd.read(16) + res = dfuse.check_usage(old=res) + assert res['statistics']['pre_read'] == 1, res + + # Open a file and read two bytes at a time. Despite disabling buffering python will try and + # read a whole page the first time. + fd = os.open(join(dfuse.dir, 'file2'), os.O_RDONLY) + data2 = os.read(fd, 2) + res = dfuse.check_usage(old=res) + assert res['statistics']['pre_read'] == 1, res + _ = os.read(fd, 2) + res = dfuse.check_usage(old=res) + assert res['statistics']['pre_read'] == 0, res + os.close(fd) - with open(join(dfuse.dir, 'file2'), 'r') as fd: - data2 = fd.read(2) - + # Open a MB file. This reads 8 128k chunks and 1 EOF. with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() + res = dfuse.check_usage(old=res) + assert res['statistics']['pre_read'] == 9, res + # Open a (1MB-1) file. This reads 8 128k chunks, the last is truncated. There is no EOF + # returned by dfuse here, just a truncated read but I assume python is interpreting a + # truncated read at the expected file size as an EOF. with open(join(dfuse.dir, 'file4'), 'r') as fd: data4 = fd.read() data5 = fd.read() - # This should not use the pre-read feature, to be validated via the logs. + res = dfuse.check_usage(old=res) + assert res['statistics']['pre_read'] == 8, res + + # This should now be read from cache. 
with open(join(dfuse.dir, 'file4'), 'r') as fd: data6 = fd.read() + res = dfuse.check_usage(old=res) + assert res['statistics']['read'] == 0, res if dfuse.stop(): self.fatal_errors = True print(data0) assert data0 == 'test' assert data1 == 'test' - assert data2 == 'te' + assert data2 == b'te', data2 assert raw_data0 == data3 assert raw_data1 == data4 assert len(data5) == 0 From fbe54835a2e2361c300773da363a109f7fee9197 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:41:20 +0000 Subject: [PATCH 08/26] DAOS-16686 dfuse: avoid duplicate RPC between readahead and read Add readahead RPC in the open read list earlier to make sure the following read will not send duplicate RPC. Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse.h | 20 ++++- src/client/dfuse/file.c | 33 +++++--- src/client/dfuse/ops/create.c | 2 +- src/client/dfuse/ops/lookup.c | 2 +- src/client/dfuse/ops/open.c | 66 ++++++++++----- src/client/dfuse/ops/opendir.c | 2 +- src/client/dfuse/ops/read.c | 146 +++++++++++++++++++-------------- 7 files changed, 172 insertions(+), 99 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 1582f17329c..a6e7152fbfc 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -95,6 +95,9 @@ struct dfuse_eq { * memory consumption */ #define DFUSE_MAX_PRE_READ (1024 * 1024 * 4) +/* Maximum file-size for pre-read in all cases */ +#define DFUSE_MAX_PRE_READ_ONCE (1024 * 1024 * 1) + /* Launch fuse, and do not return until complete */ int dfuse_launch_fuse(struct dfuse_info *dfuse_info, struct fuse_args *args); @@ -202,6 +205,8 @@ struct dfuse_obj_hdl { bool doh_kreaddir_finished; bool doh_evict_on_close; + /* the handle is doing readhead for the moment */ + bool doh_readahead_inflight; }; /* Readdir support. @@ -1039,7 +1044,10 @@ struct active_inode { /* Increase active count on inode. This takes a reference and allocates ie->active as required */ int -active_ie_init(struct dfuse_inode_entry *ie, bool *preread); +active_ie_init(struct dfuse_inode_entry *ie); + +int +active_ie_readahead_init(struct dfuse_inode_entry *ie); /* Mark a oh as closing and drop the ref on inode active */ void @@ -1214,7 +1222,15 @@ bool dfuse_dcache_get_valid(struct dfuse_inode_entry *ie, double max_age); void -dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh); +dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct dfuse_event *ev); + +int +dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, + struct dfuse_event **evp); + +void +dfuse_pre_read_abort(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, + struct dfuse_event *ev, int rc); int check_for_uns_ep(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, char *attr, diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index e612d759b54..439359a422c 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -14,7 +14,7 @@ static pthread_mutex_t alock = PTHREAD_MUTEX_INITIALIZER; /* Perhaps combine with dfuse_open_handle_init? 
*/ int -active_ie_init(struct dfuse_inode_entry *ie, bool *preread) +active_ie_init(struct dfuse_inode_entry *ie) { uint32_t oc; int rc = -DER_SUCCESS; @@ -25,11 +25,8 @@ active_ie_init(struct dfuse_inode_entry *ie, bool *preread) DFUSE_TRA_DEBUG(ie, "Addref to %d", oc + 1); - if (oc != 0) { - if (preread && *preread) - *preread = false; + if (oc != 0) goto out; - } D_ALLOC_PTR(ie->ie_active); if (!ie->ie_active) @@ -43,13 +40,6 @@ active_ie_init(struct dfuse_inode_entry *ie, bool *preread) D_INIT_LIST_HEAD(&ie->ie_active->chunks); D_INIT_LIST_HEAD(&ie->ie_active->open_reads); atomic_init(&ie->ie_active->read_count, 0); - if (preread && *preread) { - D_ALLOC_PTR(ie->ie_active->readahead); - if (ie->ie_active->readahead) { - D_INIT_LIST_HEAD(&ie->ie_active->readahead->req_list); - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); - } - } /* Take a reference on the inode to prevent it being released */ atomic_fetch_add_relaxed(&ie->ie_ref, 1); out: @@ -57,6 +47,25 @@ active_ie_init(struct dfuse_inode_entry *ie, bool *preread) return rc; } +int +active_ie_readahead_init(struct dfuse_inode_entry *ie) +{ + struct active_inode *ie_active = ie->ie_active; + + D_ASSERT(ie_active != NULL); + if (ie_active->readahead != NULL) + return 0; + + D_ALLOC_PTR(ie_active->readahead); + if (ie_active->readahead == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&ie_active->readahead->req_list); + atomic_fetch_add_relaxed(&ie->ie_open_count, 1); + + return 0; +} + static void ah_free(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { diff --git a/src/client/dfuse/ops/create.c b/src/client/dfuse/ops/create.c index d42315d8dd5..5549d17113b 100644 --- a/src/client/dfuse/ops/create.c +++ b/src/client/dfuse/ops/create.c @@ -217,7 +217,7 @@ dfuse_cb_create(fuse_req_t req, struct dfuse_inode_entry *parent, const char *na dfuse_compute_inode(dfs, &ie->ie_oid, &ie->ie_stat.st_ino); - rc = active_ie_init(ie, NULL); + rc = active_ie_init(ie); if (rc != -DER_SUCCESS) goto drop_oh; diff --git a/src/client/dfuse/ops/lookup.c b/src/client/dfuse/ops/lookup.c index 77bcc9bb0da..ac095eed4bc 100644 --- a/src/client/dfuse/ops/lookup.c +++ b/src/client/dfuse/ops/lookup.c @@ -92,7 +92,7 @@ dfuse_reply_entry(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, if (ie->ie_active) { D_ASSERT(atomic_load_relaxed(&ie->ie_open_count) == 1); active_ie_decref(dfuse_info, ie); - rc = active_ie_init(inode, NULL); + rc = active_ie_init(inode); if (rc != -DER_SUCCESS) { atomic_fetch_sub_relaxed(&ie->ie_ref, 1); dfuse_ie_close(dfuse_info, ie); diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index ab8e1b3988d..384a33bcea6 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -14,9 +14,9 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct dfuse_inode_entry *ie; struct dfuse_obj_hdl *oh; struct fuse_file_info fi_out = {0}; + struct dfuse_event *ev; + bool preread = false; int rc; - bool prefetch = false; - bool preread = false; int flags; ie = dfuse_inode_lookup_nf(dfuse_info, ino); @@ -57,6 +57,10 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if ((fi->flags & O_ACCMODE) != O_RDONLY) oh->doh_writeable = true; + rc = active_ie_init(ie); + if (rc) + goto err; + if (ie->ie_dfs->dfc_data_timeout != 0) { if (fi->flags & O_DIRECT) fi_out.direct_io = 1; @@ -68,12 +72,35 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) */ /* TODO: This probably wants reflowing to not reference ie_open_count */ - if 
(atomic_load_relaxed(&ie->ie_open_count) > 0 || + if (atomic_load_relaxed(&ie->ie_open_count) > 1 || ((ie->ie_dcache_last_update.tv_sec != 0) && dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_data_timeout))) { fi_out.keep_cache = 1; } else { - prefetch = true; + D_SPIN_LOCK(&ie->ie_active->lock); + /** + * size > 4M no pre-read + * 1M <= size <= 4M depend on other files under the directory. + * size <= 1M pre-read in any case. + */ + if ((oh->doh_parent_dir && + atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && + ie->ie_stat.st_size > 0 && + ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ) || + (ie->ie_stat.st_size > 0 && + ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ_ONCE)) { + preread = true; + /* Add the read extent to the list to make sure the following read + * will check the readahead list first. + */ + rc = dfuse_pre_read_init(dfuse_info, ie, &ev); + if (rc != 0) { + D_SPIN_UNLOCK(&ie->ie_active->lock); + D_GOTO(decref, rc); + } + oh->doh_readahead_inflight = 1; + } + D_SPIN_UNLOCK(&ie->ie_active->lock); } } else if (ie->ie_dfs->dfc_data_otoc) { /* Open to close caching, this allows the use of shared mmap */ @@ -93,18 +120,6 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) oh->doh_caching = true; fi_out.fh = (uint64_t)oh; - - /* Pre-read this for files up to the max read size. */ - if (prefetch && oh->doh_parent_dir && - atomic_load_relaxed(&oh->doh_parent_dir->ie_linear_read) && ie->ie_stat.st_size > 0 && - ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ) { - preread = true; - } - - rc = active_ie_init(ie, &preread); - if (rc) - goto err; - /* * dfs_dup() just locally duplicates the file handle. If we have * O_TRUNC flag, we need to truncate the file manually. @@ -112,7 +127,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (fi->flags & O_TRUNC) { rc = dfs_punch(ie->ie_dfs->dfs_ns, ie->ie_obj, 0, DFS_MAX_FSIZE); if (rc) - D_GOTO(decref, rc); + D_GOTO(ie_decref, rc); dfuse_dcache_evict(oh->doh_ie); } @@ -122,9 +137,11 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) * release from completing which also holds open the inode. 
*/ if (preread) - dfuse_pre_read(dfuse_info, oh); + dfuse_pre_read(dfuse_info, oh, ev); return; +ie_decref: + dfuse_pre_read_abort(dfuse_info, oh, ev, rc); decref: active_ie_decref(dfuse_info, ie); err: @@ -149,8 +166,6 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) * but the inode only tracks number of open handles with non-zero ioctl counts */ - D_ASSERT(oh->doh_ie->ie_active); - DFUSE_TRA_DEBUG(oh, "Closing %d", oh->doh_caching); DFUSE_IE_WFLUSH(oh->doh_ie); @@ -193,6 +208,17 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1); } + /* Wait inflight readahead RPC finished before release */ + if (oh->doh_ie->ie_active != NULL) { +wait_readahead: + D_SPIN_LOCK(&oh->doh_ie->ie_active->lock); + if (oh->doh_readahead_inflight) { + D_SPIN_UNLOCK(&oh->doh_ie->ie_active->lock); + goto wait_readahead; + } + D_SPIN_UNLOCK(&oh->doh_ie->ie_active->lock); + } + if (oh->doh_evict_on_close) { ie = oh->doh_ie; atomic_fetch_add_relaxed(&ie->ie_ref, 1); diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c index 60d1e6ab4d5..d41f842c2ba 100644 --- a/src/client/dfuse/ops/opendir.c +++ b/src/client/dfuse/ops/opendir.c @@ -19,7 +19,7 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ if (!oh) D_GOTO(err, rc = ENOMEM); - rc = active_ie_init(ie, NULL); + rc = active_ie_init(ie); if (rc != -DER_SUCCESS) D_GOTO(free, rc = daos_der2errno(rc)); diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 0ea49f0e089..8579386a8cf 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -46,30 +46,40 @@ cb_read_helper(struct dfuse_event *ev, void *buff) daos_event_fini(&ev->de_ev); } +/* read slave complete */ static void -dfuse_cb_read_complete(struct dfuse_event *ev) +dfuse_cb_slave_read_complete(struct dfuse_event *ev) { struct dfuse_event *evs, *evn; + d_list_t cblist; + char *buf = ev->de_iov.iov_buf; + D_INIT_LIST_HEAD(&cblist); D_SPIN_LOCK(&ev->de_oh->doh_ie->ie_active->lock); d_list_del(&ev->de_read_list); + d_list_for_each_entry_safe(evs, evn, &ev->de_read_slaves, de_read_list) + d_list_move(&evs->de_read_list, &cblist); D_SPIN_UNLOCK(&ev->de_oh->doh_ie->ie_active->lock); - d_list_for_each_entry(evs, &ev->de_read_slaves, de_read_list) { + d_list_for_each_entry(evs, &cblist, de_read_list) { DFUSE_TRA_DEBUG(ev->de_oh, "concurrent network read %p", evs->de_oh); + d_list_del(&evs->de_read_list); evs->de_len = min(ev->de_len, evs->de_req_len); evs->de_ev.ev_error = ev->de_ev.ev_error; - cb_read_helper(evs, ev->de_iov.iov_buf); - } - - cb_read_helper(ev, ev->de_iov.iov_buf); - - d_list_for_each_entry_safe(evs, evn, &ev->de_read_slaves, de_read_list) { - d_list_del(&evs->de_read_list); + D_ASSERT(evs->de_req_position >= ev->de_req_position); + cb_read_helper(evs, buf + (evs->de_req_position - ev->de_req_position)); d_slab_restock(evs->de_eqt->de_read_slab); d_slab_release(evs->de_eqt->de_read_slab, evs); } +} +static void +dfuse_cb_read_complete(struct dfuse_event *ev) +{ + char *buf = ev->de_iov.iov_buf; + + dfuse_cb_slave_read_complete(ev); + cb_read_helper(ev, buf); d_slab_restock(ev->de_eqt->de_read_slab); d_slab_release(ev->de_eqt->de_read_slab, ev); } @@ -121,32 +131,14 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o { struct active_inode *active = oh->doh_ie->ie_active; - D_SPIN_LOCK(&active->lock); - if (!active->readahead->complete) { - struct read_req *rr; - - 
D_ALLOC_PTR(rr); - if (!rr) { - D_SPIN_UNLOCK(&active->lock); - return false; - } - rr->req = req; - rr->len = len; - rr->position = position; - rr->oh = oh; - d_list_add_tail(&rr->list, &active->readahead->req_list); - D_SPIN_UNLOCK(&active->lock); - return true; - } - D_SPIN_UNLOCK(&active->lock); - if (active->readahead->dra_rc) { DFUSE_REPLY_ERR_RAW(oh, req, active->readahead->dra_rc); return true; } - if (!oh->doh_linear_read || active->readahead->dra_ev == NULL) { - DFUSE_TRA_DEBUG(oh, "Pre read disabled"); + if (!oh->doh_linear_read || active->readahead->dra_ev == NULL || + !active->readahead->complete) { + DFUSE_TRA_DEBUG(oh, "Pre read disabled or not completed"); return false; } @@ -490,7 +482,7 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct struct active_inode *active = oh->doh_ie->ie_active; struct dfuse_info *dfuse_info = fuse_req_userdata(req); bool mock_read = false; - struct dfuse_eq *eqt; + struct dfuse_eq *eqt = NULL; int rc; struct dfuse_event *ev; @@ -508,6 +500,7 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct return; } + /* Then check if the request can be filled by readahead */ if (active->readahead && dfuse_readahead_reply(req, len, position, oh)) return; @@ -538,6 +531,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct ev->de_iov.iov_buf_len); } + if (position + len > oh->doh_ie->ie_stat.st_size) + len = oh->doh_ie->ie_stat.st_size - position; ev->de_iov.iov_len = len; ev->de_req = req; ev->de_sgl.sg_nr = 1; @@ -564,11 +559,11 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct struct dfuse_event *evc; d_list_for_each_entry(evc, &active->open_reads, de_read_list) { - if (ev->de_req_position == evc->de_req_position && + if (ev->de_req_position >= evc->de_req_position && ev->de_req_len <= evc->de_req_len) { d_list_add(&ev->de_read_list, &evc->de_read_slaves); D_SPIN_UNLOCK(&active->lock); - return; + goto out; } } d_list_add_tail(&ev->de_read_list, &active->open_reads); @@ -580,7 +575,7 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct D_GOTO(err, rc); return; } - +out: /* Send a message to the async thread to wake it up and poll for events */ sem_post(&eqt->de_sem); @@ -599,29 +594,19 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct static void pre_read_mark_done(struct active_inode *active) { - struct read_req *rr, *rrn; - D_SPIN_LOCK(&active->lock); active->readahead->complete = true; D_SPIN_UNLOCK(&active->lock); - - /* No lock is held here as after complete is set then nothing further is added */ - d_list_for_each_entry_safe(rr, rrn, &active->readahead->req_list, list) { - d_list_del(&rr->list); - readahead_actual_reply(active, rr); - D_FREE(rr); - } } static void dfuse_cb_pre_read_complete(struct dfuse_event *ev) { struct dfuse_info *dfuse_info = ev->de_di; - struct dfuse_inode_entry *ie = ev->de_ie; + struct dfuse_inode_entry *ie = ev->de_oh->doh_ie; struct active_inode *active = ie->ie_active; active->readahead->dra_rc = ev->de_ev.ev_error; - if (ev->de_ev.ev_error != 0) { active->readahead->dra_rc = ev->de_ev.ev_error; daos_event_fini(&ev->de_ev); @@ -639,35 +624,79 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) active->readahead->dra_ev = NULL; } pre_read_mark_done(active); + ev->de_oh->doh_readahead_inflight = 0; + + dfuse_cb_slave_read_complete(ev); /* Drop the extra ref on active, the file could be closed before this read completes */ 
active_ie_decref(dfuse_info, ie); } -void -dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) +int +dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, + struct dfuse_event **evp) { - struct active_inode *active = oh->doh_ie->ie_active; - struct dfuse_eq *eqt; - int rc; + struct active_inode *active = ie->ie_active; + struct dfuse_eq *eqt; struct dfuse_event *ev; - size_t len = oh->doh_ie->ie_stat.st_size; + size_t len = ie->ie_stat.st_size; eqt = pick_eqt(dfuse_info); ev = d_slab_acquire(eqt->de_pre_read_slab); if (ev == NULL) - D_GOTO(err, rc = ENOMEM); + return ENOMEM; ev->de_iov.iov_len = len; ev->de_req = 0; ev->de_sgl.sg_nr = 1; - ev->de_ie = oh->doh_ie; ev->de_readahead_len = len; ev->de_req_position = 0; - ev->de_di = dfuse_info; ev->de_complete_cb = dfuse_cb_pre_read_complete; + + if (active->readahead == NULL) { + int rc; + + rc = active_ie_readahead_init(ie); + if (rc != 0) + return rc; + } active->readahead->dra_ev = ev; + /* NB: the inode_entry has been locked by ie_read_lock */ + d_list_add_tail(&ev->de_read_list, &active->open_reads); + + *evp = ev; + return 0; +} + +void +dfuse_pre_read_abort(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, + struct dfuse_event *ev, int rc) +{ + struct dfuse_eq *eqt = pick_eqt(dfuse_info); + struct active_inode *active = oh->doh_ie->ie_active; + + oh->doh_readahead_inflight = 0; + active->readahead->dra_rc = rc; + if (ev) { + daos_event_fini(&ev->de_ev); + d_slab_release(eqt->de_pre_read_slab, ev); + active->readahead->dra_ev = NULL; + } + active_ie_decref(dfuse_info, oh->doh_ie); + pre_read_mark_done(active); +} + +void +dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct dfuse_event *ev) +{ + struct dfuse_eq *eqt; + int rc; + + eqt = pick_eqt(dfuse_info); + ev->de_oh = oh; + ev->de_di = dfuse_info; + rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, 0, &ev->de_len, &ev->de_ev); if (rc != 0) goto err; @@ -680,12 +709,5 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) return; err: - active->readahead->dra_rc = rc; - if (ev) { - daos_event_fini(&ev->de_ev); - d_slab_release(eqt->de_pre_read_slab, ev); - active->readahead->dra_ev = NULL; - } - active_ie_decref(dfuse_info, oh->doh_ie); - pre_read_mark_done(active); + dfuse_pre_read_abort(dfuse_info, oh, ev, rc); } From 58a2513c87c2175a6258302560c23268a3304667 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:45:02 +0000 Subject: [PATCH 09/26] DAOS-16686 dfuse: set time for newly created entries Set the dcache update time when the entry is initialized, otherwise the keep_cache flag will not be set for opendir and a concurrent opendir might truncate the directory page cache unnecessarily. Set the valid time for inode entries created by readdirplus, otherwise the inode might need to be looked up again. Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/ops/opendir.c | 8 +++++++- src/client/dfuse/ops/readdir.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/client/dfuse/ops/opendir.c b/src/client/dfuse/ops/opendir.c index d41f842c2ba..1a5f488f326 100644 --- a/src/client/dfuse/ops/opendir.c +++ b/src/client/dfuse/ops/opendir.c @@ -35,7 +35,13 @@ dfuse_cb_opendir(fuse_req_t req, struct dfuse_inode_entry *ie, struct fuse_file_ if (ie->ie_dfs->dfc_dentry_timeout > 0) { fi_out.cache_readdir = 1; - if (dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_dentry_timeout)) + /** + * Set keep_cache to 1 to avoid the dir cache being invalidated during + * concurrent opendir. 
+ **/ + if ((ie->ie_dcache_last_update.tv_sec == 0 && + atomic_load_relaxed(&dfuse_info->di_fh_count) > 1) || + dfuse_dcache_get_valid(ie, ie->ie_dfs->dfc_dentry_timeout)) fi_out.keep_cache = 1; } diff --git a/src/client/dfuse/ops/readdir.c b/src/client/dfuse/ops/readdir.c index 10ea454fd42..c38007c84ae 100644 --- a/src/client/dfuse/ops/readdir.c +++ b/src/client/dfuse/ops/readdir.c @@ -502,6 +502,7 @@ dfuse_do_readdir(struct dfuse_info *dfuse_info, fuse_req_t req, struct dfuse_obj } set_entry_params(&entry, ie); + dfuse_mcache_set_time(ie); written = FADP(req, &reply_buff[buff_offset], size - buff_offset, drc->drc_name, &entry, drc->drc_next_offset); @@ -733,6 +734,7 @@ dfuse_do_readdir(struct dfuse_info *dfuse_info, fuse_req_t req, struct dfuse_obj } set_entry_params(&entry, ie); + dfuse_mcache_set_time(ie); written = FADP(req, &reply_buff[buff_offset], size - buff_offset, dre->dre_name, &entry, dre->dre_next_offset); From cc1b5044db8c8d18db0c9c3d87d7bc95d923b47e Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:50:33 +0000 Subject: [PATCH 10/26] DAOS-16686 dfuse: optimize open There is no need to open the object in dfuse_cb_open: if the following fetch only needs to read from the kernel page cache, then the object layout is not needed at all. Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse.h | 2 ++ src/client/dfuse/ops/open.c | 12 +++++------- src/client/dfuse/ops/read.c | 18 ++++++++++++++++++ src/client/dfuse/ops/write.c | 7 +++++++ 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index a6e7152fbfc..223914f8442 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -207,6 +207,8 @@ struct dfuse_obj_hdl { bool doh_evict_on_close; /* the handle is doing readhead for the moment */ bool doh_readahead_inflight; + + int doh_flags; }; /* Readdir support. 
diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 384a33bcea6..4c0ca184fb3 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -45,15 +45,11 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) LOG_FLAGS(ie, fi->flags); flags = fi->flags; + oh->doh_flags = flags; if (flags & O_APPEND) flags &= ~O_APPEND; - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(ie->ie_dfs->dfs_ns, ie->ie_obj, flags, &oh->doh_obj); - if (rc) - D_GOTO(err, rc); - if ((fi->flags & O_ACCMODE) != O_RDONLY) oh->doh_writeable = true; @@ -159,7 +155,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct dfuse_info *dfuse_info = fuse_req_userdata(req); struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; struct dfuse_inode_entry *ie = NULL; - int rc; + int rc = 0; uint32_t il_calls; /* Perform the opposite of what the ioctl call does, always change the open handle count @@ -226,7 +222,9 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) active_oh_decref(dfuse_info, oh); - rc = dfs_release(oh->doh_obj); + if (oh->doh_obj != NULL) + rc = dfs_release(oh->doh_obj); + if (rc == 0) { DFUSE_REPLY_ZERO_OH(oh, req); } else { diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 8579386a8cf..9f03f8faa1c 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -570,6 +570,16 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct } D_SPIN_UNLOCK(&active->lock); + if (oh->doh_obj == NULL) { + /** duplicate the file handle for the fuse handle */ + rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); + if (rc) { + ev->de_ev.ev_error = rc; + dfuse_cb_read_complete(ev); + return; + } + } + rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { D_GOTO(err, rc); @@ -694,6 +704,14 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct d int rc; eqt = pick_eqt(dfuse_info); + + if (oh->doh_obj == NULL) { + /** duplicate the file handle for the fuse handle */ + rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); + if (rc) + D_GOTO(err, rc); + } + ev->de_oh = oh; ev->de_di = dfuse_info; diff --git a/src/client/dfuse/ops/write.c b/src/client/dfuse/ops/write.c index 943f9b75e78..f0e89bf9ecf 100644 --- a/src/client/dfuse/ops/write.c +++ b/src/client/dfuse/ops/write.c @@ -101,6 +101,13 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p if (len + position > oh->doh_ie->ie_stat.st_size) oh->doh_ie->ie_stat.st_size = len + position; + if (oh->doh_obj == NULL) { + /** duplicate the file handle for the fuse handle */ + rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); + if (rc) + D_GOTO(err, rc); + } + rc = dfs_write(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_ev); if (rc != 0) D_GOTO(err, rc); From 866f0782ab6f866cdc8656fa936929019a109d90 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 19:52:44 +0000 Subject: [PATCH 11/26] DAOS-16686 dfuse: use chan patch to avoid some contention Use chan patch to avoid some contention from fuse kernel. 
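For context, the per-thread channel pattern this introduces looks roughly as follows. This is a minimal sketch only: fuse_clone_chan(), fuse_chan_put() and the _chan receive/process variants come from the patched libfuse fork referenced in build.config below, not from stock libfuse 3.16, and session setup/teardown is omitted.

    static void *
    worker(void *arg)
    {
            struct fuse_session *se = arg;
            struct fuse_buf      fbuf = {0};
            /* Each worker clones its own /dev/fuse channel so that receiving
             * requests does not serialise on one session-wide descriptor. */
            struct fuse_chan    *chan = fuse_clone_chan(se);

            while (!fuse_session_exited(se)) {
                    int rc = fuse_session_receive_buf_chan(se, &fbuf, chan);

                    if (rc == -EINTR)
                            continue;
                    if (rc <= 0)
                            break;
                    fuse_session_process_buf_chan(se, &fbuf, chan);
            }
            fuse_chan_put(chan);
            return NULL;
    }

Note that a later patch in this series ([PATCH 16/26]) reverts this change because RPM builds fail with the patched libfuse, so the sketch describes a point-in-time experiment rather than the final design.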
Required-githooks: true Signed-off-by: Ashley Pittman --- src/client/dfuse/dfuse_thread.c | 9 ++++++--- utils/build.config | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/client/dfuse/dfuse_thread.c b/src/client/dfuse/dfuse_thread.c index 4e06e4335ce..d8de0ab3fe9 100644 --- a/src/client/dfuse/dfuse_thread.c +++ b/src/client/dfuse/dfuse_thread.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2023 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -34,10 +34,11 @@ dfuse_do_work(void *arg) struct dfuse_thread *dt = arg; struct dfuse_tm *dtm = dt->dt_tm; int rc; + struct fuse_chan *chan = fuse_clone_chan(dtm->tm_se); while (!fuse_session_exited(dtm->tm_se)) { pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); - rc = fuse_session_receive_buf(dtm->tm_se, &dt->dt_fbuf); + rc = fuse_session_receive_buf_chan(dtm->tm_se, &dt->dt_fbuf, chan); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); if (rc == -EINTR) continue; @@ -52,9 +53,11 @@ dfuse_do_work(void *arg) if (atomic_load_relaxed(&dtm->tm_exit)) return NULL; - fuse_session_process_buf(dtm->tm_se, &dt->dt_fbuf); + fuse_session_process_buf_chan(dtm->tm_se, &dt->dt_fbuf, chan); } + fuse_chan_put(chan); + sem_post(&dtm->tm_finish); return NULL; diff --git a/utils/build.config b/utils/build.config index a14ad039c15..6313639fc29 100644 --- a/utils/build.config +++ b/utils/build.config @@ -3,7 +3,7 @@ component=daos [commit_versions] argobots=v1.1 -fuse=fuse-3.16.2 +fuse=a6a219f5344a5c09cec34416818342ac220a0df2 pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 @@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff +fuse=https://patch-diff.githubusercontent.com/raw/ashleypittman/fused/pull/1.patch mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff From a331daadd9855edc23386cd52cb603b35640f9de Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 20:16:11 +0000 Subject: [PATCH 12/26] DAOS-16686 dfuse: force readdir plus for all cases temporarily Temporarily force readdir plus for all cases now, which can help save some lookup RPCs in some cases. This can be removed once we use object enumeration to replace the normal key enumeration for readdir. Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse_fuseops.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/dfuse_fuseops.c b/src/client/dfuse/dfuse_fuseops.c index 980e71ac9af..38f98576740 100644 --- a/src/client/dfuse/dfuse_fuseops.c +++ b/src/client/dfuse/dfuse_fuseops.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -88,7 +88,12 @@ dfuse_fuse_init(void *arg, struct fuse_conn_info *conn) DFUSE_TRA_INFO(dfuse_info, "kernel readdir cache support compiled in"); conn->want |= FUSE_CAP_READDIRPLUS; - conn->want |= FUSE_CAP_READDIRPLUS_AUTO; + /* Temporarily force readdir plus for all cases now, which can + * help to save some lookup RPC for some cases. 
Though this can be + * removed once we use object enumeration to replace the normal key + * enumeration for readdir. XXX + */ + conn->want &= ~FUSE_CAP_READDIRPLUS_AUTO; #ifdef FUSE_CAP_CACHE_SYMLINKS conn->want |= FUSE_CAP_CACHE_SYMLINKS; From 3309425fc2db2bcf00cc654327d191ef7acc2b32 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 20:26:20 +0000 Subject: [PATCH 13/26] DAOS-16686 dfuse: read from cache for readahead Read from the readahead cache aggressively; this may need improvement later. Required-githooks: true Allow-unstable-test: true Signed-off-by: Di Wang --- src/client/dfuse/ops/read.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 9f03f8faa1c..1deb4ee60eb 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -148,8 +148,10 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o * later checks will determine if the file is read to the end. */ oh->doh_linear_read_pos = max(oh->doh_linear_read_pos, position + len); - } else if (oh->doh_linear_read_pos != position) { - DFUSE_TRA_DEBUG(oh, "disabling pre read"); + } else if ((position + len) <= active->readahead->dra_ev->de_readahead_len) { + DFUSE_TRA_DEBUG(oh, "disabling pre read %llu pos %zu != %zu", + (unsigned long long)oh->doh_ie->ie_stat.st_ino, + oh->doh_linear_read_pos, position); return false; } else { oh->doh_linear_read_pos = position + len; From a101c8897975c1d326e4700e686a917ac5ce3e2f Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 22:14:26 +0000 Subject: [PATCH 14/26] DAOS-16686 dfuse: fix style fix style Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/ops/read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 1deb4ee60eb..0e0fd05aea8 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -663,7 +663,7 @@ dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, ev->de_readahead_len = len; ev->de_req_position = 0; - ev->de_complete_cb = dfuse_cb_pre_read_complete; + ev->de_complete_cb = dfuse_cb_pre_read_complete; if (active->readahead == NULL) { int rc; @@ -705,7 +705,7 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct d struct dfuse_eq *eqt; int rc; - eqt = pick_eqt(dfuse_info); + eqt = pick_eqt(dfuse_info); if (oh->doh_obj == NULL) { /** duplicate the file handle for the fuse handle */ From 463edf3640957c133e770ee2df8ff4518c3be615 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 22:14:26 +0000 Subject: [PATCH 15/26] DAOS-16686 dfuse: fix style fix style Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/ops/read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 1deb4ee60eb..0e0fd05aea8 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -663,7 +663,7 @@ dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, ev->de_readahead_len = len; ev->de_req_position = 0; - ev->de_complete_cb = dfuse_cb_pre_read_complete; + ev->de_complete_cb = dfuse_cb_pre_read_complete; if (active->readahead == NULL) { int rc; @@ -705,7 +705,7 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct d struct dfuse_eq *eqt; int rc; - eqt = 
pick_eqt(dfuse_info); + eqt = pick_eqt(dfuse_info); if (oh->doh_obj == NULL) { /** duplicate the file handle for the fuse handle */ From 683d3175340588394c4c6675d2827384d8d3b27f Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 18 Dec 2024 23:58:34 +0000 Subject: [PATCH 16/26] DAOS-16686 dfuse: revert chan patch Revert the chan patch, since building RPMs fails with it. Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse_thread.c | 9 +++------ utils/build.config | 4 ++-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/client/dfuse/dfuse_thread.c b/src/client/dfuse/dfuse_thread.c index d8de0ab3fe9..4e06e4335ce 100644 --- a/src/client/dfuse/dfuse_thread.c +++ b/src/client/dfuse/dfuse_thread.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2020-2024 Intel Corporation. + * (C) Copyright 2020-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -34,11 +34,10 @@ dfuse_do_work(void *arg) struct dfuse_thread *dt = arg; struct dfuse_tm *dtm = dt->dt_tm; int rc; - struct fuse_chan *chan = fuse_clone_chan(dtm->tm_se); while (!fuse_session_exited(dtm->tm_se)) { pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); - rc = fuse_session_receive_buf_chan(dtm->tm_se, &dt->dt_fbuf, chan); + rc = fuse_session_receive_buf(dtm->tm_se, &dt->dt_fbuf); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); if (rc == -EINTR) continue; @@ -53,11 +52,9 @@ dfuse_do_work(void *arg) if (atomic_load_relaxed(&dtm->tm_exit)) return NULL; - fuse_session_process_buf_chan(dtm->tm_se, &dt->dt_fbuf, chan); + fuse_session_process_buf(dtm->tm_se, &dt->dt_fbuf); } - fuse_chan_put(chan); - sem_post(&dtm->tm_finish); return NULL; diff --git a/utils/build.config b/utils/build.config index 6313639fc29..a14ad039c15 100644 --- a/utils/build.config +++ b/utils/build.config @@ -3,7 +3,7 @@ component=daos [commit_versions] argobots=v1.1 -fuse=a6a219f5344a5c09cec34416818342ac220a0df2 +fuse=fuse-3.16.2 pmdk=2.1.0 isal=v2.30.0 isal_crypto=v2.23.0 @@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -fuse=https://patch-diff.githubusercontent.com/raw/ashleypittman/fused/pull/1.patch +fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff From ede7e645b40f2e0167031103e4d9b656266aa678 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 21 Dec 2024 07:24:37 +0000 Subject: [PATCH 17/26] DAOS-16686 dfuse: various fixes Revert the open optimization patch, which might cause IL test failures. Various fixes for chunk read to make it wait for inflight reads as well. Fix data corruption. 
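The data-corruption fix is in how a chained ("slave") read is completed out of the buffer of the first ("master") read that covers it: the copy must start at the slave's offset within the master's buffer, and the data is now copied into the slave's own buffer rather than replied directly from the shared one. A simplified, self-contained model of that arithmetic (the struct and names here are illustrative, not the real dfuse_event):

    #include <assert.h>
    #include <string.h>
    #include <sys/types.h>

    struct read_ev {
            char  *buf;      /* buffer owned by this request */
            off_t  position; /* file offset requested */
            size_t req_len;  /* bytes requested */
            size_t len;      /* bytes actually returned */
    };

    /* Satisfy 'slave' from the data that 'master' fetched. */
    static void
    complete_from_master(const struct read_ev *master, struct read_ev *slave)
    {
            size_t off, avail;

            assert(slave->position >= master->position);
            off   = (size_t)(slave->position - master->position);
            avail = master->len > off ? master->len - off : 0;

            slave->len = avail < slave->req_len ? avail : slave->req_len;
            /* Copy into the slave's own buffer so it can complete, and be
             * released, independently of the master. */
            memcpy(slave->buf, master->buf + off, slave->len);
    }

The patch itself computes the length as min(master length, requested length); the extra clamp against the offset above is only there to keep this standalone model safe for arbitrary inputs.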
Run-GHA: true Skip-func-hw-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse.h | 2 - src/client/dfuse/ops/open.c | 12 +++-- src/client/dfuse/ops/read.c | 96 +++++++++++++++++++----------------- src/client/dfuse/ops/write.c | 7 --- utils/node_local_test.py | 4 +- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 223914f8442..a6e7152fbfc 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -207,8 +207,6 @@ struct dfuse_obj_hdl { bool doh_evict_on_close; /* the handle is doing readhead for the moment */ bool doh_readahead_inflight; - - int doh_flags; }; /* Readdir support. diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 4c0ca184fb3..384a33bcea6 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -45,11 +45,15 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) LOG_FLAGS(ie, fi->flags); flags = fi->flags; - oh->doh_flags = flags; if (flags & O_APPEND) flags &= ~O_APPEND; + /** duplicate the file handle for the fuse handle */ + rc = dfs_dup(ie->ie_dfs->dfs_ns, ie->ie_obj, flags, &oh->doh_obj); + if (rc) + D_GOTO(err, rc); + if ((fi->flags & O_ACCMODE) != O_RDONLY) oh->doh_writeable = true; @@ -155,7 +159,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct dfuse_info *dfuse_info = fuse_req_userdata(req); struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; struct dfuse_inode_entry *ie = NULL; - int rc = 0; + int rc; uint32_t il_calls; /* Perform the opposite of what the ioctl call does, always change the open handle count @@ -222,9 +226,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) active_oh_decref(dfuse_info, oh); - if (oh->doh_obj != NULL) - rc = dfs_release(oh->doh_obj); - + rc = dfs_release(oh->doh_obj); if (rc == 0) { DFUSE_REPLY_ZERO_OH(oh, req); } else { diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 0e0fd05aea8..99680f6b373 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -48,7 +48,7 @@ cb_read_helper(struct dfuse_event *ev, void *buff) /* read slave complete */ static void -dfuse_cb_slave_read_complete(struct dfuse_event *ev) +dfuse_cb_slave_list_read_complete(struct dfuse_event *ev) { struct dfuse_event *evs, *evn; d_list_t cblist; @@ -66,20 +66,22 @@ dfuse_cb_slave_read_complete(struct dfuse_event *ev) d_list_del(&evs->de_read_list); evs->de_len = min(ev->de_len, evs->de_req_len); evs->de_ev.ev_error = ev->de_ev.ev_error; + DFUSE_IE_STAT_ADD(ev->de_oh->doh_ie, DS_PRE_READ); D_ASSERT(evs->de_req_position >= ev->de_req_position); - cb_read_helper(evs, buf + (evs->de_req_position - ev->de_req_position)); - d_slab_restock(evs->de_eqt->de_read_slab); - d_slab_release(evs->de_eqt->de_read_slab, evs); + memcpy(evs->de_iov.iov_buf, buf + (evs->de_req_position - ev->de_req_position), + evs->de_len); + evs->de_complete_cb(evs); } } static void dfuse_cb_read_complete(struct dfuse_event *ev) { - char *buf = ev->de_iov.iov_buf; + /* check slave list first */ + dfuse_cb_slave_list_read_complete(ev); - dfuse_cb_slave_read_complete(ev); - cb_read_helper(ev, buf); + /* Then complete itself */ + cb_read_helper(ev, ev->de_iov.iov_buf); d_slab_restock(ev->de_eqt->de_read_slab); d_slab_release(ev->de_eqt->de_read_slab, ev); } @@ -177,6 +179,36 @@ pick_eqt(struct dfuse_info *dfuse_info) return &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count]; } +/** + * Check for 
open matching reads, if there are multiple readers of the same file offset + * then chain future requests off the first one to avoid extra network round-trips. This + * can and does happen even with caching enabled if there are multiple client processes. + * + * Return true if there are inflight reads can satisfy the read, otherwise return false. + */ +static bool +check_inflight_fetch(struct active_inode *active, struct dfuse_event *ev) +{ + struct dfuse_event *evc; + + if (active == NULL) + return false; + + D_SPIN_LOCK(&active->lock); + d_list_for_each_entry(evc, &active->open_reads, de_read_list) { + if (ev->de_req_position >= evc->de_req_position && + ev->de_req_len <= evc->de_req_len) { + d_list_add(&ev->de_read_list, &evc->de_read_slaves); + D_SPIN_UNLOCK(&active->lock); + return true; + } + } + d_list_add_tail(&ev->de_read_list, &active->open_reads); + D_SPIN_UNLOCK(&active->lock); + + return false; +} + /* Chunk read and coalescing * * This code attempts to predict application and kernel I/O patterns and preemptively read file @@ -344,17 +376,23 @@ chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd ev->de_sgl.sg_nr = 1; ev->de_len = 0; ev->de_complete_cb = chunk_cb; + ev->de_req_len = CHUNK_SIZE; + ev->de_req_position = position; cd->ev = ev; cd->eqt = eqt; cd->reqs[slot] = req; cd->ohs[slot] = oh; + if (check_inflight_fetch(ie->ie_active, ev)) + goto out; + rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) goto err; +out: /* Send a message to the async thread to wake it up and poll for events */ sem_post(&eqt->de_sem); @@ -496,8 +534,9 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct DFUSE_TRA_DEBUG(oh, "Returning EOF early without round trip %#zx", position); oh->doh_linear_read_eof = false; - if (active->readahead) + if (active->readahead) { DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); + } DFUSE_REPLY_BUFQ(oh, req, NULL, 0); return; } @@ -549,38 +588,10 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct } ev->de_complete_cb = dfuse_cb_read_complete; - DFUSE_IE_WFLUSH(oh->doh_ie); - /* Check for open matching reads, if there are multiple readers of the same file offset - * then chain future requests off the first one to avoid extra network round-trips. This - * can and does happen even with caching enabled if there are multiple client processes. 
- */ - D_SPIN_LOCK(&active->lock); - { - struct dfuse_event *evc; - - d_list_for_each_entry(evc, &active->open_reads, de_read_list) { - if (ev->de_req_position >= evc->de_req_position && - ev->de_req_len <= evc->de_req_len) { - d_list_add(&ev->de_read_list, &evc->de_read_slaves); - D_SPIN_UNLOCK(&active->lock); - goto out; - } - } - d_list_add_tail(&ev->de_read_list, &active->open_reads); - } - D_SPIN_UNLOCK(&active->lock); - - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) { - ev->de_ev.ev_error = rc; - dfuse_cb_read_complete(ev); - return; - } - } + if (check_inflight_fetch(active, ev)) + goto out; rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { D_GOTO(err, rc); @@ -638,7 +649,7 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) pre_read_mark_done(active); ev->de_oh->doh_readahead_inflight = 0; - dfuse_cb_slave_read_complete(ev); + dfuse_cb_slave_list_read_complete(ev); /* Drop the extra ref on active, the file could be closed before this read completes */ active_ie_decref(dfuse_info, ie); } @@ -707,13 +718,6 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct d eqt = pick_eqt(dfuse_info); - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) - D_GOTO(err, rc); - } - ev->de_oh = oh; ev->de_di = dfuse_info; diff --git a/src/client/dfuse/ops/write.c b/src/client/dfuse/ops/write.c index f0e89bf9ecf..943f9b75e78 100644 --- a/src/client/dfuse/ops/write.c +++ b/src/client/dfuse/ops/write.c @@ -101,13 +101,6 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p if (len + position > oh->doh_ie->ie_stat.st_size) oh->doh_ie->ie_stat.st_size = len + position; - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) - D_GOTO(err, rc); - } - rc = dfs_write(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_ev); if (rc != 0) D_GOTO(err, rc); diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 950c1a6dcf5..b2092a29c51 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2374,11 +2374,11 @@ def test_pre_read(self): assert res['statistics']['pre_read'] == 0, res os.close(fd) - # Open a MB file. This reads 8 128k chunks and 1 EOF. + # Open a MB file. This reads 8 128k chunks. with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() res = dfuse.check_usage(old=res) - assert res['statistics']['pre_read'] == 9, res + assert res['statistics']['pre_read'] == 8, res # Open a (1MB-1) file. This reads 8 128k chunks, the last is truncated. There is no EOF # returned by dfuse here, just a truncated read but I assume python is interpreting a From 40755d796a0f52af9ade3553c153ee0abc1fcd59 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 21 Dec 2024 07:24:37 +0000 Subject: [PATCH 18/26] DAOS-16686 dfuse: various fixes Revert the open optimization patch, which might cause IL test failures. Various fixes for chunk read to make it wait for inflight reads as well. Fix data corruption. 
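The check_inflight_fetch() helper factored out by this change decides whether an incoming read can piggy-back on one that is already on the wire. The comparison is sketched below in a general containment form; the patch's own test is the simpler (pos >= c_pos && len <= c_len), which matches the aligned, fixed-size requests seen on this path:

    #include <stdbool.h>
    #include <stddef.h>
    #include <sys/types.h>

    /* Can a read of [pos, pos + len) be served from an inflight read of
     * [c_pos, c_pos + c_len)?  True only when fully contained. */
    static bool
    covered_by_inflight(off_t pos, size_t len, off_t c_pos, size_t c_len)
    {
            return pos >= c_pos && pos + (off_t)len <= c_pos + (off_t)c_len;
    }

When the test succeeds the new event is added to the first read's de_read_slaves list and is completed via the slave-list memcpy once the single network round-trip finishes.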
Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse.h | 2 - src/client/dfuse/ops/open.c | 12 +++-- src/client/dfuse/ops/read.c | 96 +++++++++++++++++++----------------- src/client/dfuse/ops/write.c | 7 --- utils/node_local_test.py | 4 +- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index 223914f8442..a6e7152fbfc 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -207,8 +207,6 @@ struct dfuse_obj_hdl { bool doh_evict_on_close; /* the handle is doing readhead for the moment */ bool doh_readahead_inflight; - - int doh_flags; }; /* Readdir support. diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 4c0ca184fb3..384a33bcea6 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -45,11 +45,15 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) LOG_FLAGS(ie, fi->flags); flags = fi->flags; - oh->doh_flags = flags; if (flags & O_APPEND) flags &= ~O_APPEND; + /** duplicate the file handle for the fuse handle */ + rc = dfs_dup(ie->ie_dfs->dfs_ns, ie->ie_obj, flags, &oh->doh_obj); + if (rc) + D_GOTO(err, rc); + if ((fi->flags & O_ACCMODE) != O_RDONLY) oh->doh_writeable = true; @@ -155,7 +159,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) struct dfuse_info *dfuse_info = fuse_req_userdata(req); struct dfuse_obj_hdl *oh = (struct dfuse_obj_hdl *)fi->fh; struct dfuse_inode_entry *ie = NULL; - int rc = 0; + int rc; uint32_t il_calls; /* Perform the opposite of what the ioctl call does, always change the open handle count @@ -222,9 +226,7 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) active_oh_decref(dfuse_info, oh); - if (oh->doh_obj != NULL) - rc = dfs_release(oh->doh_obj); - + rc = dfs_release(oh->doh_obj); if (rc == 0) { DFUSE_REPLY_ZERO_OH(oh, req); } else { diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 0e0fd05aea8..99680f6b373 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -48,7 +48,7 @@ cb_read_helper(struct dfuse_event *ev, void *buff) /* read slave complete */ static void -dfuse_cb_slave_read_complete(struct dfuse_event *ev) +dfuse_cb_slave_list_read_complete(struct dfuse_event *ev) { struct dfuse_event *evs, *evn; d_list_t cblist; @@ -66,20 +66,22 @@ dfuse_cb_slave_read_complete(struct dfuse_event *ev) d_list_del(&evs->de_read_list); evs->de_len = min(ev->de_len, evs->de_req_len); evs->de_ev.ev_error = ev->de_ev.ev_error; + DFUSE_IE_STAT_ADD(ev->de_oh->doh_ie, DS_PRE_READ); D_ASSERT(evs->de_req_position >= ev->de_req_position); - cb_read_helper(evs, buf + (evs->de_req_position - ev->de_req_position)); - d_slab_restock(evs->de_eqt->de_read_slab); - d_slab_release(evs->de_eqt->de_read_slab, evs); + memcpy(evs->de_iov.iov_buf, buf + (evs->de_req_position - ev->de_req_position), + evs->de_len); + evs->de_complete_cb(evs); } } static void dfuse_cb_read_complete(struct dfuse_event *ev) { - char *buf = ev->de_iov.iov_buf; + /* check slave list first */ + dfuse_cb_slave_list_read_complete(ev); - dfuse_cb_slave_read_complete(ev); - cb_read_helper(ev, buf); + /* Then complete itself */ + cb_read_helper(ev, ev->de_iov.iov_buf); d_slab_restock(ev->de_eqt->de_read_slab); d_slab_release(ev->de_eqt->de_read_slab, ev); } @@ -177,6 +179,36 @@ pick_eqt(struct dfuse_info *dfuse_info) return &dfuse_info->di_eqt[eqt_idx % dfuse_info->di_eq_count]; } +/** + * Check for 
open matching reads, if there are multiple readers of the same file offset + * then chain future requests off the first one to avoid extra network round-trips. This + * can and does happen even with caching enabled if there are multiple client processes. + * + * Return true if there are inflight reads can satisfy the read, otherwise return false. + */ +static bool +check_inflight_fetch(struct active_inode *active, struct dfuse_event *ev) +{ + struct dfuse_event *evc; + + if (active == NULL) + return false; + + D_SPIN_LOCK(&active->lock); + d_list_for_each_entry(evc, &active->open_reads, de_read_list) { + if (ev->de_req_position >= evc->de_req_position && + ev->de_req_len <= evc->de_req_len) { + d_list_add(&ev->de_read_list, &evc->de_read_slaves); + D_SPIN_UNLOCK(&active->lock); + return true; + } + } + d_list_add_tail(&ev->de_read_list, &active->open_reads); + D_SPIN_UNLOCK(&active->lock); + + return false; +} + /* Chunk read and coalescing * * This code attempts to predict application and kernel I/O patterns and preemptively read file @@ -344,17 +376,23 @@ chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd ev->de_sgl.sg_nr = 1; ev->de_len = 0; ev->de_complete_cb = chunk_cb; + ev->de_req_len = CHUNK_SIZE; + ev->de_req_position = position; cd->ev = ev; cd->eqt = eqt; cd->reqs[slot] = req; cd->ohs[slot] = oh; + if (check_inflight_fetch(ie->ie_active, ev)) + goto out; + rc = dfs_read(ie->ie_dfs->dfs_ns, ie->ie_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) goto err; +out: /* Send a message to the async thread to wake it up and poll for events */ sem_post(&eqt->de_sem); @@ -496,8 +534,9 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct DFUSE_TRA_DEBUG(oh, "Returning EOF early without round trip %#zx", position); oh->doh_linear_read_eof = false; - if (active->readahead) + if (active->readahead) { DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); + } DFUSE_REPLY_BUFQ(oh, req, NULL, 0); return; } @@ -549,38 +588,10 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct } ev->de_complete_cb = dfuse_cb_read_complete; - DFUSE_IE_WFLUSH(oh->doh_ie); - /* Check for open matching reads, if there are multiple readers of the same file offset - * then chain future requests off the first one to avoid extra network round-trips. This - * can and does happen even with caching enabled if there are multiple client processes. 
- */ - D_SPIN_LOCK(&active->lock); - { - struct dfuse_event *evc; - - d_list_for_each_entry(evc, &active->open_reads, de_read_list) { - if (ev->de_req_position >= evc->de_req_position && - ev->de_req_len <= evc->de_req_len) { - d_list_add(&ev->de_read_list, &evc->de_read_slaves); - D_SPIN_UNLOCK(&active->lock); - goto out; - } - } - d_list_add_tail(&ev->de_read_list, &active->open_reads); - } - D_SPIN_UNLOCK(&active->lock); - - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) { - ev->de_ev.ev_error = rc; - dfuse_cb_read_complete(ev); - return; - } - } + if (check_inflight_fetch(active, ev)) + goto out; rc = dfs_read(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_len, &ev->de_ev); if (rc != 0) { @@ -638,7 +649,7 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) pre_read_mark_done(active); ev->de_oh->doh_readahead_inflight = 0; - dfuse_cb_slave_read_complete(ev); + dfuse_cb_slave_list_read_complete(ev); /* Drop the extra ref on active, the file could be closed before this read completes */ active_ie_decref(dfuse_info, ie); } @@ -707,13 +718,6 @@ dfuse_pre_read(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh, struct d eqt = pick_eqt(dfuse_info); - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) - D_GOTO(err, rc); - } - ev->de_oh = oh; ev->de_di = dfuse_info; diff --git a/src/client/dfuse/ops/write.c b/src/client/dfuse/ops/write.c index f0e89bf9ecf..943f9b75e78 100644 --- a/src/client/dfuse/ops/write.c +++ b/src/client/dfuse/ops/write.c @@ -101,13 +101,6 @@ dfuse_cb_write(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, off_t p if (len + position > oh->doh_ie->ie_stat.st_size) oh->doh_ie->ie_stat.st_size = len + position; - if (oh->doh_obj == NULL) { - /** duplicate the file handle for the fuse handle */ - rc = dfs_dup(oh->doh_dfs, oh->doh_ie->ie_obj, oh->doh_flags, &oh->doh_obj); - if (rc) - D_GOTO(err, rc); - } - rc = dfs_write(oh->doh_dfs, oh->doh_obj, &ev->de_sgl, position, &ev->de_ev); if (rc != 0) D_GOTO(err, rc); diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 950c1a6dcf5..b2092a29c51 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2374,11 +2374,11 @@ def test_pre_read(self): assert res['statistics']['pre_read'] == 0, res os.close(fd) - # Open a MB file. This reads 8 128k chunks and 1 EOF. + # Open a MB file. This reads 8 128k chunks. with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() res = dfuse.check_usage(old=res) - assert res['statistics']['pre_read'] == 9, res + assert res['statistics']['pre_read'] == 8, res # Open a (1MB-1) file. This reads 8 128k chunks, the last is truncated. There is no EOF # returned by dfuse here, just a truncated read but I assume python is interpreting a From 5a9b03b9791cabcff5c11c98e35b7652836261aa Mon Sep 17 00:00:00 2001 From: Di Wang Date: Mon, 23 Dec 2024 01:55:02 +0000 Subject: [PATCH 19/26] DAOS-16686 dfuse: a few fixes for cleanup a few fixes for error cleanup path. 
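The open-path cleanup this settles on is a layered goto unwind: a failure after the pre-read event has been set up must abort the pre-read first (which finalises the event and drops the reference it owns) and then release the plain active reference taken earlier. A minimal sketch of the ordering, with stand-in helpers rather than the real dfuse symbols:

    #include <stdbool.h>

    /* Illustrative stand-ins for the real steps in dfuse_cb_open(). */
    static int  take_active_ref(void)    { return 0; }
    static int  init_preread_event(void) { return 0; }
    static int  truncate_if_needed(void) { return 0; }
    static void abort_preread(int rc)    { (void)rc; }
    static void drop_active_ref(void)    { }

    static int
    open_with_preread(void)
    {
            int  rc;
            bool preread = false;

            rc = take_active_ref();
            if (rc != 0)
                    goto err;
            rc = init_preread_event();
            if (rc != 0)
                    goto decref;
            preread = true;
            rc = truncate_if_needed();
            if (rc != 0)
                    goto preread_abort;
            return 0;

    preread_abort:
            if (preread)
                    abort_preread(rc);
    decref:
            drop_active_ref();
    err:
            return rc;
    }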
Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/dfuse.h | 3 --- src/client/dfuse/file.c | 19 ------------------- src/client/dfuse/ops/open.c | 14 ++++++++------ src/client/dfuse/ops/read.c | 20 +++++++++++++++++++- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/src/client/dfuse/dfuse.h b/src/client/dfuse/dfuse.h index a6e7152fbfc..1745d104e71 100644 --- a/src/client/dfuse/dfuse.h +++ b/src/client/dfuse/dfuse.h @@ -1046,9 +1046,6 @@ struct active_inode { int active_ie_init(struct dfuse_inode_entry *ie); -int -active_ie_readahead_init(struct dfuse_inode_entry *ie); - /* Mark a oh as closing and drop the ref on inode active */ void active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh); diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index 439359a422c..aca16e4c71d 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -47,25 +47,6 @@ active_ie_init(struct dfuse_inode_entry *ie) return rc; } -int -active_ie_readahead_init(struct dfuse_inode_entry *ie) -{ - struct active_inode *ie_active = ie->ie_active; - - D_ASSERT(ie_active != NULL); - if (ie_active->readahead != NULL) - return 0; - - D_ALLOC_PTR(ie_active->readahead); - if (ie_active->readahead == NULL) - return -DER_NOMEM; - - D_INIT_LIST_HEAD(&ie_active->readahead->req_list); - atomic_fetch_add_relaxed(&ie->ie_open_count, 1); - - return 0; -} - static void ah_free(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie) { diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 384a33bcea6..5e85e870498 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -89,7 +89,6 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ) || (ie->ie_stat.st_size > 0 && ie->ie_stat.st_size <= DFUSE_MAX_PRE_READ_ONCE)) { - preread = true; /* Add the read extent to the list to make sure the following read * will check the readahead list first. 
*/ @@ -98,6 +97,9 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) D_SPIN_UNLOCK(&ie->ie_active->lock); D_GOTO(decref, rc); } + /* Descreased in pre_read_complete_cb() */ + preread = true; + atomic_fetch_add_relaxed(&ie->ie_open_count, 1); oh->doh_readahead_inflight = 1; } D_SPIN_UNLOCK(&ie->ie_active->lock); @@ -127,7 +129,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) if (fi->flags & O_TRUNC) { rc = dfs_punch(ie->ie_dfs->dfs_ns, ie->ie_obj, 0, DFS_MAX_FSIZE); if (rc) - D_GOTO(ie_decref, rc); + D_GOTO(preread_abort, rc); dfuse_dcache_evict(oh->doh_ie); } @@ -140,8 +142,9 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) dfuse_pre_read(dfuse_info, oh, ev); return; -ie_decref: - dfuse_pre_read_abort(dfuse_info, oh, ev, rc); +preread_abort: + if (preread) + dfuse_pre_read_abort(dfuse_info, oh, ev, rc); decref: active_ie_decref(dfuse_info, ie); err: @@ -204,9 +207,8 @@ dfuse_cb_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) } } DFUSE_TRA_DEBUG(oh, "il_calls %d, caching %d,", il_calls, oh->doh_caching); - if (il_calls != 0) { + if (il_calls != 0) atomic_fetch_sub_relaxed(&oh->doh_ie->ie_il_count, 1); - } /* Wait inflight readahead RPC finished before release */ if (oh->doh_ie->ie_active != NULL) { diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 99680f6b373..8d1a84a44a5 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -647,11 +647,29 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) active->readahead->dra_ev = NULL; } pre_read_mark_done(active); - ev->de_oh->doh_readahead_inflight = 0; dfuse_cb_slave_list_read_complete(ev); /* Drop the extra ref on active, the file could be closed before this read completes */ active_ie_decref(dfuse_info, ie); + ev->de_oh->doh_readahead_inflight = 0; +} + +static int +active_ie_readahead_init(struct dfuse_inode_entry *ie) +{ + struct active_inode *ie_active = ie->ie_active; + + D_ASSERT(ie_active != NULL); + if (ie_active->readahead != NULL) + return 0; + + D_ALLOC_PTR(ie_active->readahead); + if (ie_active->readahead == NULL) + return -DER_NOMEM; + + D_INIT_LIST_HEAD(&ie_active->readahead->req_list); + + return 0; } int From 884096415d81fc220d199081205ab347bcdfe078 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Mon, 23 Dec 2024 16:21:53 +0000 Subject: [PATCH 20/26] DAOS-16686 dfuse: fix style fix style Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/ops/read.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 8d1a84a44a5..890dc32464b 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -370,12 +370,12 @@ chunk_fetch(fuse_req_t req, struct dfuse_obj_hdl *oh, struct read_chunk_data *cd return false; } - ev->de_iov.iov_len = CHUNK_SIZE; - ev->de_req = req; - ev->de_cd = cd; - ev->de_sgl.sg_nr = 1; - ev->de_len = 0; - ev->de_complete_cb = chunk_cb; + ev->de_iov.iov_len = CHUNK_SIZE; + ev->de_req = req; + ev->de_cd = cd; + ev->de_sgl.sg_nr = 1; + ev->de_len = 0; + ev->de_complete_cb = chunk_cb; ev->de_req_len = CHUNK_SIZE; ev->de_req_position = position; @@ -678,7 +678,7 @@ dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, { struct active_inode *active = ie->ie_active; struct dfuse_eq *eqt; - struct dfuse_event *ev; + struct dfuse_event *ev; size_t len = ie->ie_stat.st_size; eqt = pick_eqt(dfuse_info); From 
81f5ff4c0deebaead65955e6f88271c844c9d340 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Tue, 24 Dec 2024 21:15:51 +0000 Subject: [PATCH 21/26] DAOS-16686 dfuse: fix hang for chunk read fix hang during chunk read Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/file.c | 4 ++++ src/client/dfuse/ops/read.c | 41 ++++++++++++++++++++++++++----------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index aca16e4c71d..8f95a658d68 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -91,6 +91,10 @@ active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) if (read_chunk_close(oh->doh_ie)) oh->doh_linear_read = true; + /* Invalid readahead cache */ + if (oh->doh_ie->ie_active->readahead) + oh->doh_ie->ie_active->readahead->dra_ev = NULL; + /* Do not set linear read in the case where there's no reads or writes, this could be * simple open/close calls but it could also be cache use so leave the setting unchanged * in this case. diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 890dc32464b..9bd6009d95d 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -48,25 +48,28 @@ cb_read_helper(struct dfuse_event *ev, void *buff) /* read slave complete */ static void -dfuse_cb_slave_list_read_complete(struct dfuse_event *ev) +dfuse_cb_slave_list_read_complete(struct dfuse_event *ev, struct dfuse_inode_entry *ie) { struct dfuse_event *evs, *evn; d_list_t cblist; char *buf = ev->de_iov.iov_buf; + struct active_inode *ia = ie->ie_active; D_INIT_LIST_HEAD(&cblist); - D_SPIN_LOCK(&ev->de_oh->doh_ie->ie_active->lock); + D_SPIN_LOCK(&ia->lock); d_list_del(&ev->de_read_list); - d_list_for_each_entry_safe(evs, evn, &ev->de_read_slaves, de_read_list) + d_list_for_each_entry_safe(evs, evn, &ev->de_read_slaves, de_read_list) { d_list_move(&evs->de_read_list, &cblist); - D_SPIN_UNLOCK(&ev->de_oh->doh_ie->ie_active->lock); + } + D_SPIN_UNLOCK(&ia->lock); d_list_for_each_entry(evs, &cblist, de_read_list) { DFUSE_TRA_DEBUG(ev->de_oh, "concurrent network read %p", evs->de_oh); d_list_del(&evs->de_read_list); evs->de_len = min(ev->de_len, evs->de_req_len); evs->de_ev.ev_error = ev->de_ev.ev_error; - DFUSE_IE_STAT_ADD(ev->de_oh->doh_ie, DS_PRE_READ); + if (ia->readahead && ia->readahead->complete) + DFUSE_IE_STAT_ADD(ie, DS_PRE_READ); D_ASSERT(evs->de_req_position >= ev->de_req_position); memcpy(evs->de_iov.iov_buf, buf + (evs->de_req_position - ev->de_req_position), evs->de_len); @@ -78,7 +81,7 @@ static void dfuse_cb_read_complete(struct dfuse_event *ev) { /* check slave list first */ - dfuse_cb_slave_list_read_complete(ev); + dfuse_cb_slave_list_read_complete(ev, ev->de_oh->doh_ie); /* Then complete itself */ cb_read_helper(ev, ev->de_iov.iov_buf); @@ -233,7 +236,7 @@ check_inflight_fetch(struct active_inode *active, struct dfuse_event *ev) struct read_chunk_data { struct dfuse_event *ev; - struct active_inode *ia; + struct dfuse_inode_entry *ie; fuse_req_t reqs[8]; struct dfuse_obj_hdl *ohs[8]; d_list_t list; @@ -288,7 +291,8 @@ static void chunk_cb(struct dfuse_event *ev) { struct read_chunk_data *cd = ev->de_cd; - struct active_inode *ia = cd->ia; + struct dfuse_inode_entry *ie = cd->ie; + struct active_inode *ia = ie->ie_active; fuse_req_t req; bool done = false; @@ -300,6 +304,7 @@ chunk_cb(struct dfuse_event *ev) cd->bucket, cd->bucket * CHUNK_SIZE, CHUNK_SIZE, ev->de_len); } + dfuse_cb_slave_list_read_complete(ev, 
ie); daos_event_fini(&ev->de_ev); do { @@ -455,7 +460,7 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) if (cd == NULL) goto err; - cd->ia = ie->ie_active; + cd->ie = ie; cd->bucket = bucket; submit = true; @@ -497,6 +502,7 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) */ rcb = false; } else { + DFUSE_IE_STAT_ADD(ie, DS_PRE_READ); DFUSE_TRA_DEBUG(oh, "%#zx-%#zx read", position, position + K128 - 1); DFUSE_REPLY_BUFQ(oh, req, ev->de_iov.iov_buf + (slot * K128), K128); @@ -505,6 +511,17 @@ chunk_read(fuse_req_t req, size_t len, off_t position, struct dfuse_obj_hdl *oh) d_slab_release(cd->eqt->de_read_slab, cd->ev); D_FREE(cd); } + } else { + struct dfuse_info *dfuse_info = fuse_req_userdata(req); + struct dfuse_eq *eqt; + + eqt = pick_eqt(dfuse_info); + + /* Send a message to the async thread to wake it up and poll for events */ + sem_post(&eqt->de_sem); + + /* Now ensure there are more descriptors for the next request */ + d_slab_restock(eqt->de_read_slab); } } @@ -534,9 +551,9 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct DFUSE_TRA_DEBUG(oh, "Returning EOF early without round trip %#zx", position); oh->doh_linear_read_eof = false; - if (active->readahead) { + if (active->readahead) DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); - } + DFUSE_REPLY_BUFQ(oh, req, NULL, 0); return; } @@ -648,7 +665,7 @@ dfuse_cb_pre_read_complete(struct dfuse_event *ev) } pre_read_mark_done(active); - dfuse_cb_slave_list_read_complete(ev); + dfuse_cb_slave_list_read_complete(ev, ie); /* Drop the extra ref on active, the file could be closed before this read completes */ active_ie_decref(dfuse_info, ie); ev->de_oh->doh_readahead_inflight = 0; From 9d30f9fe07cd5b99ea7798324e4716d896193121 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Wed, 25 Dec 2024 04:09:04 +0000 Subject: [PATCH 22/26] DAOS-16686 dfuse: fix style fix style Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/ops/open.c | 2 +- src/client/dfuse/ops/read.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/client/dfuse/ops/open.c b/src/client/dfuse/ops/open.c index 5e85e870498..c86b57b71ca 100644 --- a/src/client/dfuse/ops/open.c +++ b/src/client/dfuse/ops/open.c @@ -97,7 +97,7 @@ dfuse_cb_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) D_SPIN_UNLOCK(&ie->ie_active->lock); D_GOTO(decref, rc); } - /* Descreased in pre_read_complete_cb() */ + /* Decreased in pre_read_complete_cb() */ preread = true; atomic_fetch_add_relaxed(&ie->ie_open_count, 1); oh->doh_readahead_inflight = 1; diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 9bd6009d95d..edf1b8ad506 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -235,18 +235,18 @@ check_inflight_fetch(struct active_inode *active, struct dfuse_event *ev) #define CHUNK_SIZE (1024 * 1024) struct read_chunk_data { - struct dfuse_event *ev; + struct dfuse_event *ev; struct dfuse_inode_entry *ie; - fuse_req_t reqs[8]; - struct dfuse_obj_hdl *ohs[8]; - d_list_t list; - uint64_t bucket; - struct dfuse_eq *eqt; - int rc; - int entered; - ATOMIC int exited; - bool exiting; - bool complete; + fuse_req_t reqs[8]; + struct dfuse_obj_hdl *ohs[8]; + d_list_t list; + uint64_t bucket; + struct dfuse_eq *eqt; + int rc; + int entered; + ATOMIC int exited; + bool exiting; + bool complete; }; static void @@ -290,11 +290,11 @@ read_chunk_close(struct 
dfuse_inode_entry *ie) static void chunk_cb(struct dfuse_event *ev) { - struct read_chunk_data *cd = ev->de_cd; + struct read_chunk_data *cd = ev->de_cd; struct dfuse_inode_entry *ie = cd->ie; struct active_inode *ia = ie->ie_active; - fuse_req_t req; - bool done = false; + fuse_req_t req; + bool done = false; cd->rc = ev->de_ev.ev_error; From 67e1e6cecafa2ace30b663ba5c176e83e70c3963 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 28 Dec 2024 04:47:57 +0000 Subject: [PATCH 23/26] DAOS-16686 dfuse: fix the typo in utils test fix the typo in utils test. Required-githooks: true Signed-off-by: Di Wang --- utils/node_local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index b2092a29c51..67d86c17986 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2378,7 +2378,7 @@ def test_pre_read(self): with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() res = dfuse.check_usage(old=res) - assert res['statistics']['pre_read'] == 8, res + assert res['statistics']['pre_read'] == 9, res # Open a (1MB-1) file. This reads 8 128k chunks, the last is truncated. There is no EOF # returned by dfuse here, just a truncated read but I assume python is interpreting a From 77643c9c3a18f5b4d8dac49aa513fc6217281ab8 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 28 Dec 2024 04:47:57 +0000 Subject: [PATCH 24/26] DAOS-16686 dfuse: fix the typo in utils test fix the typo in utils test. Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- utils/node_local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index b2092a29c51..67d86c17986 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2378,7 +2378,7 @@ def test_pre_read(self): with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() res = dfuse.check_usage(old=res) - assert res['statistics']['pre_read'] == 8, res + assert res['statistics']['pre_read'] == 9, res # Open a (1MB-1) file. This reads 8 128k chunks, the last is truncated. 
There is no EOF # returned by dfuse here, just a truncated read but I assume python is interpreting a From 9d681a462df295e61cf268ad671ad8c02fa19ece Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 28 Dec 2024 05:40:11 +0000 Subject: [PATCH 25/26] DAOS-16686 dfuse: fix the memory leak fix the memory leak Run-GHA: true Allow-unstable-test: true Required-githooks: true Signed-off-by: Di Wang --- src/client/dfuse/file.c | 7 ++++++- src/client/dfuse/ops/read.c | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/client/dfuse/file.c b/src/client/dfuse/file.c index 8f95a658d68..09fb43fbff0 100644 --- a/src/client/dfuse/file.c +++ b/src/client/dfuse/file.c @@ -92,8 +92,13 @@ active_oh_decref(struct dfuse_info *dfuse_info, struct dfuse_obj_hdl *oh) oh->doh_linear_read = true; /* Invalid readahead cache */ - if (oh->doh_ie->ie_active->readahead) + if (oh->doh_ie->ie_active->readahead && oh->doh_ie->ie_active->readahead->dra_ev) { + struct dfuse_event *ev = oh->doh_ie->ie_active->readahead->dra_ev; + + daos_event_fini(&ev->de_ev); + d_slab_release(ev->de_eqt->de_pre_read_slab, ev); oh->doh_ie->ie_active->readahead->dra_ev = NULL; + } /* Do not set linear read in the case where there's no reads or writes, this could be * simple open/close calls but it could also be cache use so leave the setting unchanged diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index edf1b8ad506..f01391af127 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -715,8 +715,10 @@ dfuse_pre_read_init(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, int rc; rc = active_ie_readahead_init(ie); - if (rc != 0) + if (rc != 0) { + d_slab_release(ev->de_eqt->de_pre_read_slab, ev); return rc; + } } active->readahead->dra_ev = ev; From 651ba04f411e18913510f7b07e14edb9e2caa768 Mon Sep 17 00:00:00 2001 From: Di Wang Date: Sat, 28 Dec 2024 17:47:58 +0000 Subject: [PATCH 26/26] DAOS-16686 dfuse: fix style fix style Required-githooks: true Run-GHA: true Allow-unstable-test: true Signed-off-by: Di Wang --- src/client/dfuse/ops/read.c | 6 +++--- utils/node_local_test.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index f01391af127..afee51d9003 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -50,9 +50,9 @@ cb_read_helper(struct dfuse_event *ev, void *buff) static void dfuse_cb_slave_list_read_complete(struct dfuse_event *ev, struct dfuse_inode_entry *ie) { - struct dfuse_event *evs, *evn; - d_list_t cblist; - char *buf = ev->de_iov.iov_buf; + struct dfuse_event *evs, *evn; + d_list_t cblist; + char *buf = ev->de_iov.iov_buf; struct active_inode *ia = ie->ie_active; D_INIT_LIST_HEAD(&cblist); diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 67d86c17986..950c1a6dcf5 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -2374,7 +2374,7 @@ def test_pre_read(self): assert res['statistics']['pre_read'] == 0, res os.close(fd) - # Open a MB file. This reads 8 128k chunks. + # Open a MB file. This reads 8 128k chunks and 1 EOF. with open(join(dfuse.dir, 'file3'), 'r') as fd: data3 = fd.read() res = dfuse.check_usage(old=res)
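As a closing note on the pre_read accounting that patches 17, 23 and 26 adjust back and forth: for the 1 MiB test file the expected count is nine, because the kernel issues eight 128 KiB reads that are served from the cached buffer, and the final zero-length EOF reply is counted as a pre-read hit as well. A trivial check of that arithmetic (plain C, purely illustrative):

    #include <stdio.h>

    int main(void)
    {
            unsigned file_size  = 1024 * 1024; /* 1 MiB test file */
            unsigned read_size  = 128 * 1024;  /* kernel read size */
            unsigned chunk_hits = file_size / read_size; /* = 8 */

            /* The zero-length EOF reply also counts as a pre-read. */
            printf("expected pre_read = %u\n", chunk_hits + 1); /* 9 */
            return 0;
    }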