Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CORE-8685 crash_tracker: crash signal handlers for crash recording #24959

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/v/crash_tracker/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ redpanda_cc_library(
"prepared_writer.cc",
"recorder.cc",
"service.cc",
"signals.cc",
"types.cc",
],
hdrs = [
Expand All @@ -16,6 +17,7 @@ redpanda_cc_library(
"prepared_writer.h",
"recorder.h",
"service.h",
"signals.h",
"types.h",
],
implementation_deps = [
Expand Down
1 change: 1 addition & 0 deletions src/v/crash_tracker/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ v_cc_library(
prepared_writer.cc
recorder.cc
service.cc
signals.cc
types.cc
DEPS
Seastar::seastar
Expand Down
44 changes: 44 additions & 0 deletions src/v/crash_tracker/recorder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include <chrono>
#include <filesystem>
#include <string_view>

using namespace std::chrono_literals;

Expand Down Expand Up @@ -127,8 +128,51 @@ void print_skipping() {
ss::print_safe(skipping.data(), skipping.size());
}

void record_message(crash_description& cd, std::string_view msg) {
auto& format_buf = cd.crash_message;
fmt::format_to_n(
format_buf.begin(),
format_buf.capacity(),
"{} on shard {}.",
msg,
ss::this_shard_id());
}

} // namespace

/// Async-signal safe
void recorder::record_crash_sighandler(recorded_signo signo) {
auto* cd_opt = _writer.fill();
if (!cd_opt) {
// The writer has already been consumed by another crash
print_skipping();
return;
}
auto& cd = *cd_opt;

record_backtrace(cd);

switch (signo) {
case recorded_signo::sigsegv: {
cd.type = crash_type::segfault;
record_message(cd, "Segmentation fault");
break;
}
case recorded_signo::sigabrt: {
cd.type = crash_type::abort;
record_message(cd, "Aborting");
break;
}
case recorded_signo::sigill: {
cd.type = crash_type::illegal_instruction;
record_message(cd, "Illegal instruction");
break;
}
}

_writer.write();
}

void recorder::record_crash_exception(std::exception_ptr eptr) {
if (is_crash_loop_limit_reached(eptr)) {
// We specifically do not want to record crash_loop_limit_reached errors
Expand Down
4 changes: 3 additions & 1 deletion src/v/crash_tracker/recorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,16 @@ class recorder {
using include_malformed_files
= ss::bool_class<struct include_malformed_files_tag>;

enum class recorded_signo { sigsegv, sigabrt, sigill };

/// Visible for testing
~recorder() = default;

ss::future<> start();
ss::future<> stop();

/// Async-signal safe
void record_crash_sighandler(int signo);
void record_crash_sighandler(recorded_signo signo);

void record_crash_exception(std::exception_ptr eptr);

Expand Down
94 changes: 94 additions & 0 deletions src/v/crash_tracker/signals.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/

#include "crash_tracker/recorder.h"

#include <seastar/util/print_safe.hh>

#include <atomic>

namespace crash_tracker {

namespace {

// Installs handler for Signal which ensures that Func is invoked only once
// in the whole program. After Func is invoked, the previously-installed signal
// handler is restored and triggered.
template<int Signal, void (*Func)()>
void install_oneshot_signal_handler() {
static ss::util::spinlock lock{};

// Capture the original (seastar's or SIG_DFL) signal handler so we can
// run it after redpanda's signal handler
static struct sigaction original_sa;

struct sigaction sa;
sa.sa_sigaction = [](int sig, siginfo_t*, void*) {
std::lock_guard<ss::util::spinlock> g{lock};

Func();

// Reinstall the original signal handler after calling Func() to ensure
// that a concurrent signal on another thread does not terminate the
// process before Func() finishes
auto r = ::sigaction(sig, &original_sa, nullptr);
if (r == -1) {
// This should not be possible given the man page of sigaction
// but it is prudent to handle this case by reinstalling the
// default sighandler
constexpr static std::string_view sigaction_failure
= "Failed to reinstall original signal handler\n";
ss::print_safe(sigaction_failure.data(), sigaction_failure.size());

::signal(sig, SIG_DFL);
}

// Reraise the signal to trigger the original signal handler
pthread_kill(pthread_self(), sig);
};
sigfillset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO | SA_RESTART;
if (Signal == SIGSEGV) {
sa.sa_flags |= SA_ONSTACK;
}
auto r = ::sigaction(Signal, &sa, &original_sa);
ss::throw_system_error_on(r == -1);

// Unblock the signal on all reactor threads to ensure that our handler is
// called
ss::smp::invoke_on_all([] {
auto mask = ss::make_sigset_mask(Signal);
ss::throw_pthread_error(::pthread_sigmask(SIG_UNBLOCK, &mask, nullptr));
}).get();
}

void sigsegv_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigsegv;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}
void sigabrt_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigabrt;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}
void sigill_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigill;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}

} // namespace

void install_sighandlers() {
install_oneshot_signal_handler<SIGSEGV, sigsegv_action>();
install_oneshot_signal_handler<SIGABRT, sigabrt_action>();
install_oneshot_signal_handler<SIGILL, sigill_action>();
}

} // namespace crash_tracker
18 changes: 18 additions & 0 deletions src/v/crash_tracker/signals.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/

#pragma once

namespace crash_tracker {

void install_sighandlers();

} // namespace crash_tracker
2 changes: 2 additions & 0 deletions src/v/redpanda/application.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
#include "config/node_config.h"
#include "config/seed_server.h"
#include "config/types.h"
#include "crash_tracker/signals.h"
#include "crypto/ossl_context_service.h"
#include "datalake/cloud_data_io.h"
#include "datalake/coordinator/catalog_factory.h"
Expand Down Expand Up @@ -1028,6 +1029,7 @@ void application::check_environment() {
void application::init_crashtracker(::stop_signal& app_signal) {
_crash_tracker_service = std::make_unique<crash_tracker::service>();
_crash_tracker_service->start(app_signal.abort_source()).get();
crash_tracker::install_sighandlers();
}

void application::schedule_crash_tracker_file_cleanup() {
Expand Down
39 changes: 35 additions & 4 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,10 @@ class RedpandaServiceBase(RedpandaServiceABC, Service):
}
}

# Thread name of shards to be used with redpanda_tid()
SHARD_0_THREAD_NAME = "redpanda"
SHARD_1_THREAD_NAME = "reactor-1"

class FIPSMode(Enum):
disabled = 0
permissive = 1
Expand Down Expand Up @@ -2550,8 +2554,6 @@ def __init__(self,
# which can kill redpanda nodes.
# This is a number to allow multiple callers to set it.
self.tolerate_not_running = 0
# Do not fail test on crashes including asserts. This is useful when
# running with redpanda's fault injection enabled.
self._tolerate_crashes = False
self._rpk_node_config = rpk_node_config

Expand Down Expand Up @@ -3131,12 +3133,20 @@ def check_node(node):

return all(self.for_nodes(self._started, check_node))

def signal_redpanda(self, node, signal=signal.SIGKILL, idempotent=False):
def signal_redpanda(self,
node,
signal=signal.SIGKILL,
idempotent=False,
thread=None):
"""
:param idempotent: if true, then kill-like signals are ignored if
the process is already gone.
:param thread: if set, then the signal is sent to the given thread
"""
pid = self.redpanda_pid(node)
if thread is None:
pid = self.redpanda_pid(node)
else:
pid = self.redpanda_tid(node, thread)
if pid is None:
if idempotent and signal in {signal.SIGKILL, signal.SIGTERM}:
return
Expand Down Expand Up @@ -4188,6 +4198,20 @@ def redpanda_pid(self, node):

raise e

def redpanda_tid(self, node, thread):
"""Return the thread ID of the given thread"""
cmd = "ps -C redpanda -T"
for line in node.account.ssh_capture(cmd, timeout_sec=10):
# Example line:
# PID SPID TTY TIME CMD
# 2662879 2662879 pts/16 00:00:02 redpanda
self.logger.debug(f"ps output: {line}")
parts = line.split()
thread_name = parts[4]
if thread == thread_name:
return int(parts[1])
return None

def started_nodes(self) -> List[ClusterNode]:
return list(self._started)

Expand Down Expand Up @@ -5342,6 +5366,13 @@ def set_up_failure_injection(self, finject_cfg: FailureInjectionConfig,

self.logger.info(f"Set up failure injection config for nodes: {nodes}")

def set_tolerate_crashes(self, tolerate_crashes: bool):
"""
Do not fail test on crashes including asserts. This is useful when
running with redpanda's fault injection enabled.
"""
self._tolerate_crashes = tolerate_crashes

def validate_controller_log(self):
"""
This method is for use at end of tests, to detect issues that might
Expand Down
Loading