From fd1252a8f5950b12ebcbac638e95942ec41a184b Mon Sep 17 00:00:00 2001 From: Don Brady Date: Thu, 9 Jan 2025 20:59:47 +0000 Subject: [PATCH] More changes from recent feedback also added test from Tony Signed-off-by: Don Brady --- include/sys/fs/zfs.h | 2 +- lib/libzfs/libzfs.abi | 2 +- lib/libzfs/libzfs_pool.c | 2 +- man/man7/vdevprops.7 | 16 ++-- module/zcommon/zpool_prop.c | 6 +- module/zfs/vdev.c | 2 +- module/zfs/vdev_raidz.c | 60 +++++------- tests/runfiles/common.run | 4 + tests/zfs-tests/include/libtest.shlib | 10 ++ tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/events/slow_vdev_sit_out.ksh | 93 +++++++++++++++++++ 12 files changed, 149 insertions(+), 50 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 547cfb72d1c4..102ffef016c5 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -379,7 +379,7 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, - VDEV_PROP_SIT_OUT_READS, + VDEV_PROP_SIT_OUT, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f2aef8754460..4f5dd1c983fc 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5917,7 +5917,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ec30d2a6ddd3..dc0f0c53730c 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5478,7 +5478,7 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, /* Only use if provided by the RAIDZ VDEV above */ if (prop == VDEV_PROP_RAIDZ_EXPANDING) return (ENOENT); - if (prop == VDEV_PROP_SIT_OUT_READS) + if (prop == VDEV_PROP_SIT_OUT) return (ENOENT); } if (vdev_prop_index_to_string(prop, intval, diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 844864518c1e..229715c35d92 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -104,19 
+104,23 @@ Comma separated list of children of this vdev The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev -.It Sy sit_out_reads +.It Sy sit_out True when a slow disk outlier was detected and the vdev is currently in a sit out state. While sitting out, the vdev will not participate in normal reads, instead its data will be reconstructed as needed from parity. .It Sy slow_ios -The number of slow I/Os encountered by this vdev, -These represent I/O operations that didn't complete in +This indicates the number of slow I/O operations encountered by this vdev. +A slow I/O is defined as an operation that did not complete within the .Sy zio_slow_io_ms -milliseconds +threshold in milliseconds .Pq Sy 30000 No by default . -Can also be incremented when a vdev was determined to be a raidz leaf latency -outlier. +For +.Sy RAIDZ +and +.Sy DRAID +configurations, this value also represents the number of times the vdev was +identified as an outlier and excluded from participating in read I/O operations. 
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 461fc7faefe2..ef932ec4a0f6 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -466,9 +466,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); - zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0, - PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS", - boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0, + PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table, + sfeatures); zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 003caceb3328..f03200cdf86c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -6363,7 +6363,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) ZPROP_SRC_NONE); } continue; - case VDEV_PROP_SIT_OUT_READS: + case VDEV_PROP_SIT_OUT: /* Only expose this for a draid or raidz leaf */ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_top != NULL && diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 801707042a5c..599e68360833 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2302,11 +2302,11 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) if (raidz_read_sit_out_secs == 0) return (B_FALSE); - /* Avoid skipping a data column read when resilvering */ - if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) + /* Avoid skipping a data column read when scrubbing */ + if (io_flags & ZIO_FLAG_SCRUB) return (B_FALSE); - return (vd->vdev_read_sit_out_expire >= gethrtime()); + return (vd->vdev_read_sit_out_expire >= 
gethrestime_sec()); } /* @@ -2318,10 +2318,10 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags) static uint64_t calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) { /* - * Scale using 16 bits with an effective alpha of 0.50 + * Scale using 8 bits with an effective alpha of 0.25 */ - const uint64_t scale = 16; - const uint64_t alpha = 32768; + const uint64_t scale = 8; + const uint64_t alpha = 64; return (((alpha * latest_value) + (((1ULL << scale) - alpha) * previous_ewma)) >> scale); @@ -2874,10 +2874,14 @@ latency_quartiles_fence(const uint64_t *data, size_t n) else q3 = latency_median_value(&data[(n+1) >> 1], n>>1); - uint64_t iqr = q3 - q1; - uint64_t fence = q3 + iqr; + /* + * To avoid detecting false positive outliers when N is small + * and the latency values are very close, make sure the fence + * is at least 25% larger than Q1. + */ + uint64_t iqr = MAX(q3 - q1, q1 >> 3); - return (fence); + return (q3 + (iqr << 1)); } #define LAT_SAMPLES_STACK 64 @@ -2908,14 +2912,6 @@ vdev_child_slow_outlier(zio_t *zio) if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN) return; - spa_t *spa = zio->io_spa; - if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || - spa_load_state(spa) == SPA_LOAD_RECOVER || - (spa_load_state(spa) != SPA_LOAD_NONE && - spa->spa_last_open_failed)) { - return; - } - hrtime_t now = gethrtime(); uint64_t last = atomic_load_64(&vd->vdev_last_latency_check); @@ -2934,14 +2930,12 @@ vdev_child_slow_outlier(zio_t *zio) lat_data = &data[0]; uint64_t max = 0; - uint64_t max_outier_count = 0; vdev_t *svd = NULL; /* suspect vdev */ - vdev_t *ovd = NULL; /* largest outlier vdev */ for (int c = 0; c < samples; c++) { vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_read_sit_out_expire != 0) { - if (cvd->vdev_read_sit_out_expire < gethrtime()) { + if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) { /* * Done with our sit out, wait for new outlier * to emerge.
@@ -2965,12 +2959,6 @@ vdev_child_slow_outlier(zio_t *zio) max = lat_data[c]; svd = cvd; } - - uint64_t count = atomic_load_64(&cvd->vdev_outlier_count); - if (count > max_outier_count) { - max_outier_count = count; - ovd = cvd; - } } qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare); @@ -2982,28 +2970,26 @@ vdev_child_slow_outlier(zio_t *zio) * higher than peers outlier count will be considered * a slow disk. */ - if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) > - LAT_OUTLIER_LIMIT && svd == ovd && - svd->vdev_read_sit_out_expire == 0) { + if (++svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) { + ASSERT0(svd->vdev_read_sit_out_expire); /* * Begin a sit out period for this slow drive */ - svd->vdev_read_sit_out_expire = gethrtime() + - SEC2NSEC(raidz_read_sit_out_secs); + svd->vdev_read_sit_out_expire = gethrestime_sec() + + raidz_read_sit_out_secs; /* count each slow io period */ mutex_enter(&svd->vdev_stat_lock); svd->vdev_stat.vs_slow_ios++; mutex_exit(&svd->vdev_stat_lock); - (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, svd, - NULL, NULL, 0); + (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, svd, NULL, NULL, 0); vdev_dbgmsg(svd, "begin read sit out for %d secs", (int)raidz_read_sit_out_secs); - for (int c = 0; c < vd->vdev_children; c++) { - atomic_store_64( - &vd->vdev_child[c]->vdev_outlier_count, 0); - } + + for (int c = 0; c < vd->vdev_children; c++) + vd->vdev_child[c]->vdev_outlier_count = 0; } } out: diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 8a4a4b0f5cb8..8c954dba2d18 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -702,6 +702,10 @@ tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', 'dio_unaligned_block', 'dio_unaligned_filesize'] tags = ['functional', 'direct'] +[tests/functional/events] +tests = ['slow_vdev_sit_out'] +tags = ['functional', 'events'] + [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = 
['functional', 'exec'] diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 9cf919c3dd0f..fbad1747b642 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1109,6 +1109,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop="$1" + typeset pool="$2" + typeset vdev="$3" + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? otherwise # # $1 - pool name diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..7d667c5537c5 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -69,6 +69,7 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes +RAIDZ_READ_SIT_OUT_SECS vdev.raidz_read_sit_out_secs raidz_read_sit_out_secs REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index df183825dc68..9bbe99b0afd8 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1497,6 +1497,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ + functional/events/slow_vdev_sit_out.ksh \ functional/events/zed_cksum_config.ksh \ 
functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ diff --git a/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh new file mode 100755 index 000000000000..27c3a8f5f823 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. + +# DESCRIPTION: +# Verify that vdevs 'sit out' when they are slow +# +# STRATEGY: +# 1. Create various raidz/draid pools +# 2. Inject delays into one of the disks +# 3. Verify disk is set to 'sit out' for a while. +# 4. Wait for RAIDZ_READ_SIT_OUT_SECS and verify sit out state is lifted. +# + +. 
$STF_SUITE/include/libtest.shlib + +# Avoid name conflicts with the default pool +TESTPOOL2=testpool2 + +function cleanup +{ + restore_tunable RAIDZ_READ_SIT_OUT_SECS + log_must zinject -c all + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/vdev.$$.* +} + +log_assert "Verify sit_out works" + +log_onexit cleanup + +# shorten sit out period for testing +save_tunable RAIDZ_READ_SIT_OUT_SECS +set_tunable32 RAIDZ_READ_SIT_OUT_SECS 5 + +log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9} + +for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do + log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9} + log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=100 + log_must zpool export $TESTPOOL2 + log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2 + + BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9 + + # Initial state should not be sitting out + log_must eval [[ "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" ]] + + # Delay our reads 200ms to trigger sit out + log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2 + + # Do some reads and wait for us to sit out + for i in {1..100} ; do + dd if=/$TESTPOOL2/bigfile skip=$i bs=1M count=1 of=/dev/null &> /dev/null + + sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV) + if [[ "$sit_out" == "on" ]] ; then + break + fi + done + + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on" + + # Clear fault injection + log_must zinject -c all + + # Wait for us to exit our sit out period + sleep 6 + log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off" + destroy_pool $TESTPOOL2 +done + +log_pass "sit_out works correctly"