Skip to content

Commit

Permalink
dt/crash_tracker: test crash reports with signals
Browse files Browse the repository at this point in the history
This adds a test to verify that crash reports captured while handling
crash signals (segfaults, sigabrt, sigill) are reported correctly.
It also verifies that the generated crash reports can be read back later
and are correctly printed when the crash loop limit is reached.

This requires exposing `_tolerate_crashes` from `RedpandaService` to
prevent test failures on crash-detecting checks, since for this test,
the crashes are expected.
  • Loading branch information
pgellert committed Jan 28, 2025
1 parent 2fc681f commit 5710b4a
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 2 deletions.
9 changes: 7 additions & 2 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -2550,8 +2550,6 @@ def __init__(self,
# which can kill redpanda nodes.
# This is a number to allow multiple callers to set it.
self.tolerate_not_running = 0
# Do not fail test on crashes including asserts. This is useful when
# running with redpanda's fault injection enabled.
self._tolerate_crashes = False
self._rpk_node_config = rpk_node_config

Expand Down Expand Up @@ -5342,6 +5340,13 @@ def set_up_failure_injection(self, finject_cfg: FailureInjectionConfig,

self.logger.info(f"Set up failure injection config for nodes: {nodes}")

def set_tolerate_crashes(self, tolerate_crashes: bool):
    """
    Choose whether crashes (asserts included) should fail the test.

    Pass True to stop crashes from failing the test, which is handy when
    redpanda runs with fault injection enabled or when a test provokes
    crashes on purpose; pass False to restore the strict behaviour.
    """
    self._tolerate_crashes = tolerate_crashes

def validate_controller_log(self):
"""
This method is for use at end of tests, to detect issues that might
Expand Down
66 changes: 66 additions & 0 deletions tests/rptest/tests/crash_loop_checks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0

import signal
from rptest.services.cluster import cluster
from rptest.tests.redpanda_test import RedpandaTest
from rptest.services.redpanda import RedpandaService
from rptest.util import expect_exception
from rptest.services.redpanda import LoggingConfig
from ducktape.errors import TimeoutError
from ducktape.mark import matrix
from ducktape.utils.util import wait_until
from rptest.utils.mode_checks import skip_debug_mode


class CrashLoopChecksTest(RedpandaTest):
Expand All @@ -25,6 +29,12 @@ class CrashLoopChecksTest(RedpandaTest):
".*Failure during startup: crash_tracker::crash_loop_limit_reached \(Crash loop detected, aborting startup.\).*"
]

# Log fragments expected when redpanda is brought down by SIGABRT, SIGSEGV
# and SIGILL respectively. The signal crash test appends these to its
# log_allow_list (presumably so the expected crash lines do not trip the
# test's log checks -- confirm against the `cluster` decorator).
SIGNAL_CRASH_LOG = [
"Aborting on",
"Segmentation fault on",
"Illegal instruction on",
]

# main - application.cc:348 - Failure during startup: std::__1::system_error (error C-Ares:4, unreachable_host.com: Not found)
# main - application.cc:363 - Failure during startup: std::__1::system_error (error C-Ares:11, unreachable_host.com: Connection refused)
HOSTNAME_ERRORS = [
Expand Down Expand Up @@ -68,6 +78,18 @@ def expect_crash_count(self, expected):
crash_files = self.count_crash_files(self.broker)
assert crash_files == expected, f"Unexpected number of crashes: {crash_files} != {expected}"

def wait_for_redpanda_stop(self, broker, timeout=10):
    """
    Wait for the redpanda process to terminate (e.g. after sending a crash
    signal).

    Polls `self.redpanda.redpanda_pid(broker)` every 0.2 seconds and
    returns once it reports `None` for the node.

    :param broker: node whose redpanda process is expected to go away
    :param timeout: maximum number of seconds to wait
    :raises: whatever `wait_until` raises when the condition is still false
        after `timeout` seconds (ducktape documents this as TimeoutError)
    """
    wait_until(
        # Identity check: `None` is a singleton, so `is` is the correct
        # comparison and cannot be affected by a custom `__eq__`.
        lambda: self.redpanda.redpanda_pid(broker) is None,
        timeout_sec=timeout,
        backoff_sec=0.2,
        err_msg=
        f"Redpanda processes did not terminate on {broker.name} in {timeout} sec"
    )

@cluster(num_nodes=1, log_allow_list=CRASH_LOOP_LOG)
def test_crash_loop_checks_with_tracker_file(self):
broker = self.redpanda.nodes[0]
Expand Down Expand Up @@ -177,3 +199,47 @@ def test_crash_report_with_startup_exception(self):
"Crash #4 at 20.* UTC - Failure during startup: std::__1::system_error (error C-Ares:4, unreachable_host.com: Not found) Backtrace: 0x.*"
)
self.expect_crash_count(1 + CrashLoopChecksTest.CRASH_LOOP_LIMIT + 1)

@cluster(num_nodes=1, log_allow_list=CRASH_LOOP_LOG + SIGNAL_CRASH_LOG)
@matrix(signo=[signal.SIGABRT, signal.SIGSEGV, signal.SIGILL])
def test_crash_report_with_signal(self, signo):
    """
    Crash redpanda repeatedly with `signo` and check the crash reports.

    The node is killed and restarted CRASH_LOOP_LIMIT times, then killed
    once more so that the following start hits the crash loop limit. At
    that point the startup log must list the recorded crashes, including
    one describing the signal that was delivered.
    """
    limit_message = "Too many consecutive crashes"
    crash_loop_limit = CrashLoopChecksTest.CRASH_LOOP_LIMIT

    # Crashes are the whole point of this test, so they must not fail it.
    self.redpanda.set_tolerate_crashes(True)
    node = self.redpanda.nodes[0]

    def crash_node():
        # Deliver the signal and block until the process is really gone.
        self.redpanda.signal_redpanda(node, signo)
        self.wait_for_redpanda_stop(node)

    # Crash and restart CRASH_LOOP_LIMIT times; every restart here is
    # still expected to succeed.
    for _attempt in range(crash_loop_limit):
        crash_node()
        self.redpanda.start_node(node)

    # One crash report per crash, plus a fresh one created by the most
    # recent start_node.
    self.expect_crash_count(crash_loop_limit + 1)

    # Sanity check: the crash loop limit has not been reported yet.
    limit_already_reported = self.redpanda.search_log_node(
        node, limit_message)
    assert not limit_already_reported

    # One more crash pushes the next start over the crash loop limit.
    crash_node()
    self.redpanda.start_node(node, expect_fail=True)

    # The limit message must now be present, along with crash details.
    assert self.redpanda.search_log_node(node, limit_message)

    # Description each supported signal is expected to carry in the log.
    description_by_signal = {
        signal.SIGSEGV: "Segmentation fault",
        signal.SIGABRT: "Aborting",
        signal.SIGILL: "Illegal instruction",
    }
    description = description_by_signal.get(signo)
    assert description is not None, "Test failure: not yet implemented"

    expected_report = f"Crash #4 at 20.* - {description} on shard.* Backtrace: "
    assert self.redpanda.search_log_node(node, expected_report)

0 comments on commit 5710b4a

Please sign in to comment.