Skip to content

Commit

Permalink
More changes from recent feedback
Browse files Browse the repository at this point in the history
also added test from Tony

Signed-off-by: Don Brady <[email protected]>
  • Loading branch information
don-brady committed Jan 9, 2025
1 parent 64d5138 commit fd1252a
Show file tree
Hide file tree
Showing 12 changed files with 149 additions and 50 deletions.
2 changes: 1 addition & 1 deletion include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ typedef enum {
VDEV_PROP_TRIM_SUPPORT,
VDEV_PROP_TRIM_ERRORS,
VDEV_PROP_SLOW_IOS,
VDEV_PROP_SIT_OUT_READS,
VDEV_PROP_SIT_OUT,
VDEV_NUM_PROPS
} vdev_prop_t;

Expand Down
2 changes: 1 addition & 1 deletion lib/libzfs/libzfs.abi
Original file line number Diff line number Diff line change
Expand Up @@ -5917,7 +5917,7 @@
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
<enumerator name='VDEV_PROP_SIT_OUT_READS' value='52'/>
<enumerator name='VDEV_PROP_SIT_OUT' value='52'/>
<enumerator name='VDEV_NUM_PROPS' value='53'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
Expand Down
2 changes: 1 addition & 1 deletion lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -5478,7 +5478,7 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
/* Only use if provided by the RAIDZ VDEV above */
if (prop == VDEV_PROP_RAIDZ_EXPANDING)
return (ENOENT);
if (prop == VDEV_PROP_SIT_OUT_READS)
if (prop == VDEV_PROP_SIT_OUT)
return (ENOENT);
}
if (vdev_prop_index_to_string(prop, intval,
Expand Down
16 changes: 10 additions & 6 deletions man/man7/vdevprops.7
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,23 @@ Comma separated list of children of this vdev
The number of children belonging to this vdev
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
The number of errors of each type encountered by this vdev
.It Sy sit_out_reads
.It Sy sit_out
True when a slow disk outlier was detected and the vdev is currently in a sit
out state.
While sitting out, the vdev will not participate in normal reads, instead its
data will be reconstructed as needed from parity.
.It Sy slow_ios
The number of slow I/Os encountered by this vdev,
These represent I/O operations that didn't complete in
This indicates the number of slow I/O operations encountered by this vdev.
A slow I/O is defined as an operation that did not complete within the
.Sy zio_slow_io_ms
milliseconds
threshold in milliseconds
.Pq Sy 30000 No by default .
Can also be incremented when a vdev was determined to be a raidz leaf latency
outlier.
For
.Sy RAIDZ
and
.Sy DRAID
configurations, this value also represents the number of times the vdev was
identified as an outlier and excluded from participating in read I/O operations.
.It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops
The number of I/O operations of each type performed by this vdev
.It Xo
Expand Down
6 changes: 3 additions & 3 deletions module/zcommon/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,9 @@ vdev_prop_init(void)
zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SIT_OUT_READS, "sit_out_reads", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT_READS",
boolean_table, sfeatures);
zprop_register_index(VDEV_PROP_SIT_OUT, "sit_out", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "SIT_OUT", boolean_table,
sfeatures);
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
boolean_table, sfeatures);
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -6363,7 +6363,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ZPROP_SRC_NONE);
}
continue;
case VDEV_PROP_SIT_OUT_READS:
case VDEV_PROP_SIT_OUT:
/* Only expose this for a draid or raidz leaf */
if (vd->vdev_ops->vdev_op_leaf &&
vd->vdev_top != NULL &&
Expand Down
60 changes: 23 additions & 37 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -2302,11 +2302,11 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
if (raidz_read_sit_out_secs == 0)
return (B_FALSE);

/* Avoid skipping a data column read when resilvering */
if (io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
/* Avoid skipping a data column read when scrubbing */
if (io_flags & ZIO_FLAG_SCRUB)
return (B_FALSE);

return (vd->vdev_read_sit_out_expire >= gethrtime());
return (vd->vdev_read_sit_out_expire >= gethrestime_sec());
}

/*
Expand All @@ -2318,10 +2318,10 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
static uint64_t
calculate_ewma(uint64_t previous_ewma, uint64_t latest_value) {
/*
* Scale using 16 bits with an effective alpha of 0.50
* Scale using 8 bits with an effective alpha of 0.25
*/
const uint64_t scale = 16;
const uint64_t alpha = 32768;
const uint64_t scale = 8;
const uint64_t alpha = 64;

return (((alpha * latest_value) + (((1ULL << scale) - alpha) *
previous_ewma)) >> scale);
Expand Down Expand Up @@ -2874,10 +2874,14 @@ latency_quartiles_fence(const uint64_t *data, size_t n)
else
q3 = latency_median_value(&data[(n+1) >> 1], n>>1);

uint64_t iqr = q3 - q1;
uint64_t fence = q3 + iqr;
/*
 * To avoid detecting false positive outliers when N is small
 * and the latency values are very close, make sure the fence
 * is at least 25% larger than Q1.
*/
uint64_t iqr = MAX(q3 - q1, q1 >> 3);

return (fence);
return (q3 + (iqr << 1));
}

#define LAT_SAMPLES_STACK 64
Expand Down Expand Up @@ -2908,14 +2912,6 @@ vdev_child_slow_outlier(zio_t *zio)
if (raidz_read_sit_out_secs == 0 || vd->vdev_children < LAT_SAMPLES_MIN)
return;

spa_t *spa = zio->io_spa;
if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
spa_load_state(spa) == SPA_LOAD_RECOVER ||
(spa_load_state(spa) != SPA_LOAD_NONE &&
spa->spa_last_open_failed)) {
return;
}

hrtime_t now = gethrtime();
uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);

Expand All @@ -2934,14 +2930,12 @@ vdev_child_slow_outlier(zio_t *zio)
lat_data = &data[0];

uint64_t max = 0;
uint64_t max_outier_count = 0;
vdev_t *svd = NULL; /* suspect vdev */
vdev_t *ovd = NULL; /* largest outlier vdev */
for (int c = 0; c < samples; c++) {
vdev_t *cvd = vd->vdev_child[c];

if (cvd->vdev_read_sit_out_expire != 0) {
if (cvd->vdev_read_sit_out_expire < gethrtime()) {
if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) {
/*
* Done with our sit out, wait for new outlier
* to emerge.
Expand All @@ -2965,12 +2959,6 @@ vdev_child_slow_outlier(zio_t *zio)
max = lat_data[c];
svd = cvd;
}

uint64_t count = atomic_load_64(&cvd->vdev_outlier_count);
if (count > max_outier_count) {
max_outier_count = count;
ovd = cvd;
}
}

qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
Expand All @@ -2982,28 +2970,26 @@ vdev_child_slow_outlier(zio_t *zio)
* higher than peers outlier count will be considered
* a slow disk.
*/
if (atomic_add_64_nv(&svd->vdev_outlier_count, 1) >
LAT_OUTLIER_LIMIT && svd == ovd &&
svd->vdev_read_sit_out_expire == 0) {
if (++svd->vdev_outlier_count > LAT_OUTLIER_LIMIT) {
ASSERT0(svd->vdev_read_sit_out_expire);
/*
* Begin a sit out period for this slow drive
*/
svd->vdev_read_sit_out_expire = gethrtime() +
SEC2NSEC(raidz_read_sit_out_secs);
svd->vdev_read_sit_out_expire = gethrestime_sec() +
raidz_read_sit_out_secs;

/* count each slow io period */
mutex_enter(&svd->vdev_stat_lock);
svd->vdev_stat.vs_slow_ios++;
mutex_exit(&svd->vdev_stat_lock);

(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY, spa, svd,
NULL, NULL, 0);
(void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
zio->io_spa, svd, NULL, NULL, 0);
vdev_dbgmsg(svd, "begin read sit out for %d secs",
(int)raidz_read_sit_out_secs);
for (int c = 0; c < vd->vdev_children; c++) {
atomic_store_64(
&vd->vdev_child[c]->vdev_outlier_count, 0);
}

for (int c = 0; c < vd->vdev_children; c++)
vd->vdev_child[c]->vdev_outlier_count = 0;
}
}
out:
Expand Down
4 changes: 4 additions & 0 deletions tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,10 @@ tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines',
'dio_unaligned_block', 'dio_unaligned_filesize']
tags = ['functional', 'direct']

[tests/functional/events]
tests = ['slow_vdev_sit_out']
tags = ['functional', 'events']

[tests/functional/exec]
tests = ['exec_001_pos', 'exec_002_neg']
tags = ['functional', 'exec']
Expand Down
10 changes: 10 additions & 0 deletions tests/zfs-tests/include/libtest.shlib
Original file line number Diff line number Diff line change
Expand Up @@ -1109,6 +1109,16 @@ function get_pool_prop # property pool
zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool"
}

# Print the requested vdev property value in parsable form, or fail
# the test if the lookup does not succeed.
#
# $1 - property name
# $2 - pool name
# $3 - vdev name
function get_vdev_prop
{
	typeset prop="$1" pool="$2" vdev="$3"

	if ! zpool get -Hpo value "$prop" "$pool" "$vdev"; then
		log_fail "zpool get $prop $pool $vdev"
	fi
}

# Return 0 if a pool exists; $? otherwise
#
# $1 - pool name
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/include/tunables.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval
OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize
PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable
RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes
RAIDZ_READ_SIT_OUT_SECS vdev.raidz_read_sit_out_secs raidz_read_sit_out_secs
REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled
REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress
REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/events_001_pos.ksh \
functional/events/events_002_pos.ksh \
functional/events/setup.ksh \
functional/events/slow_vdev_sit_out.ksh \
functional/events/zed_cksum_config.ksh \
functional/events/zed_cksum_reported.ksh \
functional/events/zed_diagnose_multiple.ksh \
Expand Down
93 changes: 93 additions & 0 deletions tests/zfs-tests/tests/functional/events/slow_vdev_sit_out.ksh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#

# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.

# DESCRIPTION:
#	Verify that vdevs 'sit out' when they are slow
#
# STRATEGY:
#	1. Create various raidz/draid pools
#	2. Inject delays into one of the disks
#	3. Verify disk is set to 'sit out' for a while.
#	4. Wait for RAIDZ_READ_SIT_OUT_SECS and verify sit out state is lifted.
#

. $STF_SUITE/include/libtest.shlib

# Avoid name conflicts with the default pool
TESTPOOL2=testpool2

function cleanup
{
	restore_tunable RAIDZ_READ_SIT_OUT_SECS
	log_must zinject -c all
	destroy_pool $TESTPOOL2
	log_must rm -f $TEST_BASE_DIR/vdev.$$.*
}

log_assert "Verify sit_out works"

log_onexit cleanup

# shorten sit out period for testing
save_tunable RAIDZ_READ_SIT_OUT_SECS
set_tunable32 RAIDZ_READ_SIT_OUT_SECS 5

log_must truncate -s 150M $TEST_BASE_DIR/vdev.$$.{0..9}

for raidtype in raidz raidz2 raidz3 draid1 draid2 draid3 ; do
	log_must zpool create $TESTPOOL2 $raidtype $TEST_BASE_DIR/vdev.$$.{0..9}
	log_must dd if=/dev/urandom of=/$TESTPOOL2/bigfile bs=1M count=100
	log_must zpool export $TESTPOOL2
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL2

	BAD_VDEV=$TEST_BASE_DIR/vdev.$$.9

	# Initial state should not be sitting out
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"

	# Delay our reads 200ms to trigger sit out
	log_must zinject -d $BAD_VDEV -D200:1 -T read $TESTPOOL2

	# Do some reads and wait for us to sit out.  Note: '&>' is a
	# bashism that backgrounds the command under ksh93, so redirect
	# stdout and stderr explicitly to keep the dd synchronous.
	for i in {1..100} ; do
		dd if=/$TESTPOOL2/bigfile skip=$i bs=1M count=1 \
		    of=/dev/null >/dev/null 2>&1

		sit_out=$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)
		if [[ "$sit_out" == "on" ]] ; then
			break
		fi
	done

	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "on"

	# Clear fault injection
	log_must zinject -c all

	# Wait for us to exit our sit out period
	sleep 6
	log_must test "$(get_vdev_prop sit_out $TESTPOOL2 $BAD_VDEV)" == "off"
	destroy_pool $TESTPOOL2
done

log_pass "sit_out works correctly"

0 comments on commit fd1252a

Please sign in to comment.