Set timeout for benchmark jobs and expose mountpoint logs when failing. (#528)

We have seen multiple timeouts from the benchmark recently, and the default
6-hour timeout is too long. We should be able to fail faster, because the job
definitions tell us how long each benchmark should run.

We also want to get mountpoint logs from the failed job so that we can
investigate the problem.

Signed-off-by: Monthon Klongklaew <[email protected]>
monthonk authored Sep 21, 2023
1 parent 11def47 commit 3a81908
Showing 2 changed files with 60 additions and 7 deletions.
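Both scripts apply the same fail-fast pattern: wrap the benchmark command in GNU coreutils timeout, capture its exit status, and dump the tail of the mountpoint logs before exiting. Note that timeout exits with status 124 when the time limit is hit, so a single non-zero check catches both hangs and ordinary failures. A minimal standalone sketch of the pattern, not part of the repo; "sleep 1" stands in for the fio / ls invocations and LOG_DIR is a placeholder:

#!/bin/bash
# Sketch of the fail-fast pattern used in the diffs below.
set -e
LOG_DIR=logs/example_job    # placeholder log directory

set +e                      # don't let `set -e` abort before we can report
timeout 300s sleep 1        # GNU timeout exits with 124 if the limit is hit
job_status=$?
set -e
if [ $job_status -ne 0 ]; then
    tail -1000 ${LOG_DIR}/mountpoint-s3-*    # surface mountpoint's logs
    echo "Job failed with exit code ${job_status}"
    exit 1
fi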
39 changes: 34 additions & 5 deletions mountpoint-s3/scripts/fs_bench.sh
@@ -42,6 +42,7 @@ run_fio_job() {
     job_file=$1
     bench_file=$2
     mount_dir=$3
+    log_dir=$4
 
     job_name=$(basename "${job_file}")
     job_name="${job_name%.*}"
@@ -51,13 +52,23 @@ run_fio_job() {
     for i in $(seq 1 $iterations);
     do
         echo -n "${i};"
-        fio --thread \
+        # from the job definitions, we know each fio job runs for exactly 1 minute;
+        # something must be wrong if it takes much longer than that.
+        set +e
+        timeout 300s fio --thread \
             --output=${results_dir}/${job_name}_iter${i}.json \
             --output-format=json \
             --directory=${mount_dir} \
             --filename=${bench_file} \
             --eta=never \
             ${job_file}
+        job_status=$?
+        set -e
+        if [ $job_status -ne 0 ]; then
+            tail -1000 ${log_dir}/mountpoint-s3-*
+            echo "Job ${job_name} failed with exit code ${job_status}"
+            exit 1
+        fi
     done
     echo "done"
 
@@ -75,11 +86,19 @@ read_benchmark () {
     for job_file in "${jobs_dir}"/*.fio; do
         mount_dir=$(mktemp -d /tmp/fio-XXXXXXXXXXXX)
 
+        job_name=$(basename "${job_file}")
+        job_name="${job_name%.*}"
+        log_dir=logs/${job_name}
+        rm -rf ${log_dir}
+        mkdir -p ${log_dir}
+
         # mount file system
         set +e
         cargo run --quiet --release -- \
             ${S3_BUCKET_NAME} ${mount_dir} \
+            --debug \
             --allow-delete \
+            --log-directory=${log_dir} \
             --prefix=${S3_BUCKET_TEST_PREFIX}
         mount_status=$?
         set -e
@@ -96,26 +115,35 @@ read_benchmark () {
         fi
 
         # run the benchmark
-        run_fio_job $job_file $bench_file $mount_dir
+        run_fio_job $job_file $bench_file $mount_dir $log_dir
 
         # unmount file system
         sudo umount ${mount_dir}
 
-        # cleanup mount directory
+        # cleanup mount directory and log directory
         rm -rf ${mount_dir}
+        rm -rf ${log_dir}
     done
 }
 
 write_benchmark () {
     jobs_dir=mountpoint-s3/scripts/fio/write
 
     for job_file in "${jobs_dir}"/*.fio; do
+        job_name=$(basename "${job_file}")
+        job_name="${job_name%.*}"
+        log_dir=logs/${job_name}
+        rm -rf ${log_dir}
+        mkdir -p ${log_dir}
+
         # mount file system
         mount_dir=$(mktemp -d /tmp/fio-XXXXXXXXXXXX)
         set +e
         cargo run --quiet --release -- \
             ${S3_BUCKET_NAME} ${mount_dir} \
+            --debug \
             --allow-delete \
+            --log-directory=${log_dir} \
             --prefix=${S3_BUCKET_TEST_PREFIX}
         mount_status=$?
         set -e
@@ -128,13 +156,14 @@ write_benchmark () {
         bench_file=${mount_dir}/${job_name}_${RANDOM}.dat
 
         # run the benchmark
-        run_fio_job $job_file $bench_file $mount_dir
+        run_fio_job $job_file $bench_file $mount_dir $log_dir
 
         # unmount file system
         sudo umount ${mount_dir}
 
-        # cleanup mount directory
+        # cleanup mount directory and log directory
        rm -rf ${mount_dir}
+        rm -rf ${log_dir}
     done
 }
 
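Because each per-job log directory is removed only after a successful run, a failed job leaves logs/<job_name>/ behind for inspection, in addition to the tail the script prints. A hypothetical post-failure inspection; "seq_read" is a placeholder job name, actual names come from the .fio job files:

# Inspect surviving mountpoint logs after a failed fs_bench.sh run.
ls logs/                                   # one directory per job that ran
tail -1000 logs/seq_read/mountpoint-s3-*   # same tail the script prints on failure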
28 changes: 26 additions & 2 deletions mountpoint-s3/scripts/fs_latency_bench.sh
@@ -44,11 +44,16 @@ do
     target_dir="${mount_dir}/bench_dir_${dir_size}"
     startdelay=30
 
+    log_dir=logs/${job_name}
+    mkdir -p $log_dir
+
     echo "Running ${job_name}"
 
     # mount file system
     cargo run --release ${S3_BUCKET_NAME} ${mount_dir} \
+        --debug \
         --allow-delete \
+        --log-directory=$log_dir \
         --prefix=${S3_BUCKET_TEST_PREFIX}
     mount_status=$?
     if [ $mount_status -ne 0 ]; then
@@ -67,7 +72,14 @@ do
     iteration=10
     for i in $(seq 1 $iteration);
     do
-        /usr/bin/time -o ${results_dir}/time_output.txt -v ls -f "${target_dir}" >/dev/null 2>&1
+        # we don't know exactly how long this will take, but 5 minutes should be enough
+        timeout 300s /usr/bin/time -o ${results_dir}/time_output.txt -v ls -f "${target_dir}" >/dev/null 2>&1
+        job_status=$?
+        if [ $job_status -ne 0 ]; then
+            tail -1000 ${log_dir}/mountpoint-s3-*
+            echo "Job ${job_name} failed with exit code ${job_status}"
+            exit 1
+        fi
 
         elapsed_time=$(awk '/Elapsed/ {print $8}' ${results_dir}/time_output.txt)
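For context, the awk extraction above relies on the shape of GNU time's -v output, where the wall-clock line reads "Elapsed (wall clock) time (h:mm:ss or m:ss): 0:05.23" and the duration is the eighth whitespace-separated field. A quick self-contained check; the sample duration is made up:

# Demonstrates the field-8 extraction on a sample `/usr/bin/time -v` line.
echo 'Elapsed (wall clock) time (h:mm:ss or m:ss): 0:05.23' \
    | awk '/Elapsed/ {print $8}'    # prints 0:05.23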

@@ -105,11 +117,16 @@ for job_file in "${jobs_dir}"/*.fio; do
     job_name=$(basename "${job_file}")
     job_name="${job_name%.*}"
 
+    log_dir=logs/${job_name}
+    mkdir -p $log_dir
+
     echo "Running ${job_name}"
 
     # mount file system
     cargo run --release ${S3_BUCKET_NAME} ${mount_dir} \
+        --debug \
         --allow-delete \
+        --log-directory=$log_dir \
         --prefix=${S3_BUCKET_TEST_PREFIX}
     mount_status=$?
     if [ $mount_status -ne 0 ]; then
@@ -124,12 +141,19 @@ for job_file in "${jobs_dir}"/*.fio; do
         bench_file=${S3_BUCKET_SMALL_BENCH_FILE}
     fi
 
-    fio --thread \
+    # time to first byte should not be longer than 5 minutes
+    timeout 300s fio --thread \
         --output=${results_dir}/${job_name}.json \
         --output-format=json \
         --directory=${mount_dir} \
         --filename=${bench_file} \
         ${job_file}
+    job_status=$?
+    if [ $job_status -ne 0 ]; then
+        tail -1000 ${log_dir}/mountpoint-s3-*
+        echo "Job ${job_name} failed with exit code ${job_status}"
+        exit 1
+    fi
 
     jq -n 'inputs.jobs[] | if (."job options".rw == "read")
         then {name: .jobname, value: (.read.lat_ns.mean / 1000000), unit: "milliseconds"}
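The jq filter above (truncated here) converts fio's mean read latency from nanoseconds to milliseconds by dividing lat_ns.mean by 1000000. A self-contained check of the read branch; the job name and latency value are made up for illustration:

# Exercise the ns-to-ms conversion on fabricated sample fio JSON output.
echo '{"jobs":[{"jobname":"ttfb_job","job options":{"rw":"read"},"read":{"lat_ns":{"mean":2500000}}}]}' \
    | jq -n 'inputs.jobs[] | {name: .jobname, value: (.read.lat_ns.mean / 1000000), unit: "milliseconds"}'
# -> {"name":"ttfb_job","value":2.5,"unit":"milliseconds"}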
