From 3bc2bea4d3609111db667c53c603477da3df8cc8 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Mon, 11 Nov 2024 18:25:59 -0500
Subject: [PATCH 1/2] L2ARC: Move different stats updates earlier

..., before we make the header or the log block visible to others.
It should fix an assertion on allocated space going negative if the
header is freed once the lock is dropped, while the write is still
going.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Fixes #16040
---
 module/zfs/arc.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 76dc0b19139d..2ece7d67704f 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -9287,6 +9287,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
+			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+			    ARC_FLAG_L2_WRITING);
+
+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+			l2arc_hdr_arcstats_increment(hdr);
+			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
 			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
@@ -9298,12 +9306,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			}
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
-			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
-			    ARC_FLAG_L2_WRITING);
-
-			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
-			    arc_hdr_size(hdr), hdr);
-			l2arc_hdr_arcstats_increment(hdr);
 
 			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
 			mutex_exit(hash_lock);
@@ -9333,7 +9335,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
-			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			if (commit) {
 				/* l2ad_hand will be adjusted inside. */
@@ -10585,6 +10586,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	(void) zio_nowait(wzio);
 
 	dev->l2ad_hand += asize;
+	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
 	/*
 	 * Include the committed log block's pointer in the list of pointers
 	 * to log blocks present in the L2ARC device.
@@ -10598,7 +10601,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
-	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
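
The race this patch closes reduces to an ordering invariant: account for
the space before the header becomes visible, because once it is visible
another thread may free it and subtract the accounting. Below is a
minimal standalone sketch of that invariant; it is toy code, not OpenZFS
code, and the names alloc_bytes and published are hypothetical stand-ins
for dev->l2ad_alloc and the header's presence on dev->l2ad_buflist.

/* ordering_sketch.c: toy model of the ordering fix; not OpenZFS code. */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct hdr { long size; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct hdr *published;	/* header visible to other threads */
static long alloc_bytes;	/* space accounted to the device */

static void
publish_hdr(struct hdr *h)
{
	/*
	 * Account for the space *before* the header becomes visible.
	 * If this ran after the publish (as before the patch), a
	 * concurrent free_hdr() could subtract first and drive the
	 * counter negative.
	 */
	__atomic_add_fetch(&alloc_bytes, h->size, __ATOMIC_SEQ_CST);

	pthread_mutex_lock(&list_lock);
	published = h;
	pthread_mutex_unlock(&list_lock);
}

static void *
free_hdr(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&list_lock);
	struct hdr *h = published;
	published = NULL;
	pthread_mutex_unlock(&list_lock);

	if (h != NULL) {
		long v = __atomic_sub_fetch(&alloc_bytes, h->size,
		    __ATOMIC_SEQ_CST);
		assert(v >= 0);	/* the assertion the patch protects */
	}
	return (NULL);
}

int
main(void)
{
	struct hdr h = { .size = 4096 };
	pthread_t t;

	publish_hdr(&h);
	pthread_create(&t, NULL, free_hdr, NULL);
	pthread_join(t, NULL);
	printf("alloc_bytes = %ld\n", alloc_bytes);
	return (0);
}

With the accounting moved after the publish, the same two threads can
interleave so the subtraction lands first and trips the assertion; the
patch applies the equivalent reordering to the real counters (l2ad_alloc,
the arcstats, and the vdev space accounting).
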
From a824df7eb31c4f1c310d05ee0708722a3f99f68b Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Mon, 4 Nov 2024 15:27:40 +1100
Subject: [PATCH 2/2] dsl_dataset: put IO-inducing frees on the pool deadlist

dsl_free() calls zio_free() to free the block. For most blocks, this
simply calls metaslab_free() without doing any IO or putting anything
on the IO pipeline.

Some blocks, however, require additional IO to free. This at least
includes gang, dedup and cloned blocks. For those, zio_free() will
issue a ZIO_TYPE_FREE IO and return.

If a huge number of blocks are being freed all at once, it's possible
for dsl_dataset_block_kill() to be called millions of times on a
single transaction (e.g. a 2T object of 128K blocks is 16M blocks). If
those are all IO-inducing frees, that then becomes 16M FREE IOs placed
on the pipeline. At time of writing, a zio_t is 1280 bytes, so for
just one 2T object that requires a 20G allocation of resident memory
from the zio_cache. If that can't be satisfied by the kernel, an
out-of-memory condition is raised.

This would be better handled by improving the cases that the
dmu_tx_assign() throttle will handle, or by reducing the overheads
required by the IO pipeline, or with a better central facility for
freeing blocks.

For now, we simply check for the cases that would cause zio_free() to
create a FREE IO, and instead put the block on the pool's freelist.
This is the same place that blocks from destroyed datasets go, and the
async destroy machinery will automatically see them and trickle them
out as normal.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris
---
 module/zfs/dsl_dataset.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 2248f644bee7..4712addf81be 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -68,6 +68,7 @@
 #include
 #include
 #include
+#include <sys/brt.h>
 
 /*
  * The SPA supports block sizes up to 16MB. However, very large blocks
@@ -289,8 +290,26 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 	if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
 		int64_t delta;
 
-		dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
-		dsl_free(tx->tx_pool, tx->tx_txg, bp);
+		/*
+		 * Put blocks that would create IO on the pool's deadlist for
+		 * dsl_process_async_destroys() to find. This is to prevent
+		 * zio_free() from creating a ZIO_TYPE_FREE IO for them, which
+		 * are very heavy and can lead to out-of-memory conditions if
+		 * something tries to free millions of blocks on the same txg.
+		 */
+		boolean_t defer = spa_version(spa) >= SPA_VERSION_DEADLISTS &&
+		    (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+		    brt_maybe_exists(spa, bp));
+
+		if (defer) {
+			dprintf_bp(bp, "putting on free list: %s", "");
+			bpobj_enqueue(&ds->ds_dir->dd_pool->dp_free_bpobj,
+			    bp, B_FALSE, tx);
+		} else {
+			dprintf_bp(bp, "freeing ds=%llu",
+			    (u_longlong_t)ds->ds_object);
+			dsl_free(tx->tx_pool, tx->tx_txg, bp);
+		}
 
 		mutex_enter(&ds->ds_lock);
 		ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
@@ -298,9 +317,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
 		delta = parent_delta(ds, -used);
 		dsl_dataset_phys(ds)->ds_unique_bytes -= used;
 		mutex_exit(&ds->ds_lock);
+
 		dsl_dir_diduse_transfer_space(ds->ds_dir,
 		    delta, -compressed, -uncompressed, -used,
 		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
+
+		if (defer)
+			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+			    DD_USED_HEAD, used, compressed, uncompressed, tx);
 	} else {
 		dprintf_bp(bp, "putting on dead list: %s", "");
 		if (async) {
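
The sizing claim in the commit message can be checked directly: a 2T
object of 128K blocks is 2^41 / 2^17 = 2^24 = ~16M blocks, and 2^24
zio_t allocations of 1280 bytes each is 5 * 2^32 bytes = 20G. Here is a
small standalone C program reproducing that arithmetic; it is
illustrative only, and the 1280-byte zio_t size is simply the figure
quoted in the message above.

/* free_cost.c: back-of-envelope check; not OpenZFS code. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t object_size = 2ULL << 40;	/* 2T object */
	const uint64_t block_size = 128ULL << 10;	/* 128K blocks */
	const uint64_t zio_t_size = 1280;	/* bytes, per the message */

	uint64_t nblocks = object_size / block_size;
	uint64_t resident = nblocks * zio_t_size;

	printf("FREE IOs queued:  %" PRIu64 " (~%" PRIu64 "M)\n",
	    nblocks, nblocks >> 20);
	printf("zio_cache memory: %" PRIu64 " bytes (~%" PRIu64 "G)\n",
	    resident, resident >> 30);
	return (0);
}

It prints 16777216 blocks (~16M) and 21474836480 bytes (~20G), matching
the commit message, and for every additional 2T object freed in the same
txg the pre-patch pipeline would have needed another 20G of zio_cache.
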