Skip to content

Commit

Permalink
Merge pull request #24959 from pgellert/crashlog/crash-signal-recording
Browse files Browse the repository at this point in the history
CORE-8685 crash_tracker: crash signal handlers for crash recording
  • Loading branch information
pgellert authored Feb 5, 2025
2 parents 692bfc5 + c2e59b7 commit b586de8
Show file tree
Hide file tree
Showing 9 changed files with 286 additions and 13 deletions.
2 changes: 2 additions & 0 deletions src/v/crash_tracker/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ redpanda_cc_library(
"prepared_writer.cc",
"recorder.cc",
"service.cc",
"signals.cc",
"types.cc",
],
hdrs = [
Expand All @@ -16,6 +17,7 @@ redpanda_cc_library(
"prepared_writer.h",
"recorder.h",
"service.h",
"signals.h",
"types.h",
],
implementation_deps = [
Expand Down
1 change: 1 addition & 0 deletions src/v/crash_tracker/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ v_cc_library(
prepared_writer.cc
recorder.cc
service.cc
signals.cc
types.cc
DEPS
Seastar::seastar
Expand Down
44 changes: 44 additions & 0 deletions src/v/crash_tracker/recorder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include <chrono>
#include <filesystem>
#include <string_view>

using namespace std::chrono_literals;

Expand Down Expand Up @@ -127,8 +128,51 @@ void print_skipping() {
ss::print_safe(skipping.data(), skipping.size());
}

void record_message(crash_description& cd, std::string_view msg) {
auto& format_buf = cd.crash_message;
fmt::format_to_n(
format_buf.begin(),
format_buf.capacity(),
"{} on shard {}.",
msg,
ss::this_shard_id());
}

} // namespace

/// Async-signal safe
void recorder::record_crash_sighandler(recorded_signo signo) {
auto* cd_opt = _writer.fill();
if (!cd_opt) {
// The writer has already been consumed by another crash
print_skipping();
return;
}
auto& cd = *cd_opt;

record_backtrace(cd);

switch (signo) {
case recorded_signo::sigsegv: {
cd.type = crash_type::segfault;
record_message(cd, "Segmentation fault");
break;
}
case recorded_signo::sigabrt: {
cd.type = crash_type::abort;
record_message(cd, "Aborting");
break;
}
case recorded_signo::sigill: {
cd.type = crash_type::illegal_instruction;
record_message(cd, "Illegal instruction");
break;
}
}

_writer.write();
}

void recorder::record_crash_exception(std::exception_ptr eptr) {
if (is_crash_loop_limit_reached(eptr)) {
// We specifically do not want to record crash_loop_limit_reached errors
Expand Down
4 changes: 3 additions & 1 deletion src/v/crash_tracker/recorder.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,16 @@ class recorder {
using include_malformed_files
= ss::bool_class<struct include_malformed_files_tag>;

enum class recorded_signo { sigsegv, sigabrt, sigill };

/// Visible for testing
~recorder() = default;

ss::future<> start();
ss::future<> stop();

/// Async-signal safe
void record_crash_sighandler(int signo);
void record_crash_sighandler(recorded_signo signo);

void record_crash_exception(std::exception_ptr eptr);

Expand Down
94 changes: 94 additions & 0 deletions src/v/crash_tracker/signals.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/

#include "crash_tracker/recorder.h"

#include <seastar/util/print_safe.hh>

#include <atomic>

namespace crash_tracker {

namespace {

// Installs handler for Signal which ensures that Func is invoked only once
// in the whole program. After Func is invoked, the previously-installed signal
// handler is restored and triggered.
template<int Signal, void (*Func)()>
void install_oneshot_signal_handler() {
static ss::util::spinlock lock{};

// Capture the original (seastar's or SIG_DFL) signal handler so we can
// run it after redpanda's signal handler
static struct sigaction original_sa;

struct sigaction sa;
sa.sa_sigaction = [](int sig, siginfo_t*, void*) {
std::lock_guard<ss::util::spinlock> g{lock};

Func();

// Reinstall the original signal handler after calling Func() to ensure
// that a concurrent signal on another thread does not terminate the
// process before Func() finishes
auto r = ::sigaction(sig, &original_sa, nullptr);
if (r == -1) {
// This should not be possible given the man page of sigaction
// but it is prudent to handle this case by reinstalling the
// default sighandler
constexpr static std::string_view sigaction_failure
= "Failed to reinstall original signal handler\n";
ss::print_safe(sigaction_failure.data(), sigaction_failure.size());

::signal(sig, SIG_DFL);
}

// Reraise the signal to trigger the original signal handler
pthread_kill(pthread_self(), sig);
};
sigfillset(&sa.sa_mask);
sa.sa_flags = SA_SIGINFO | SA_RESTART;
if (Signal == SIGSEGV) {
sa.sa_flags |= SA_ONSTACK;
}
auto r = ::sigaction(Signal, &sa, &original_sa);
ss::throw_system_error_on(r == -1);

// Unblock the signal on all reactor threads to ensure that our handler is
// called
ss::smp::invoke_on_all([] {
auto mask = ss::make_sigset_mask(Signal);
ss::throw_pthread_error(::pthread_sigmask(SIG_UNBLOCK, &mask, nullptr));
}).get();
}

void sigsegv_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigsegv;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}
void sigabrt_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigabrt;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}
void sigill_action() {
constexpr auto signo = crash_tracker::recorder::recorded_signo::sigill;
crash_tracker::get_recorder().record_crash_sighandler(signo);
}

} // namespace

void install_sighandlers() {
install_oneshot_signal_handler<SIGSEGV, sigsegv_action>();
install_oneshot_signal_handler<SIGABRT, sigabrt_action>();
install_oneshot_signal_handler<SIGILL, sigill_action>();
}

} // namespace crash_tracker
18 changes: 18 additions & 0 deletions src/v/crash_tracker/signals.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Copyright 2025 Redpanda Data, Inc.
*
* Use of this software is governed by the Business Source License
* included in the file licenses/BSL.md
*
* As of the Change Date specified in that file, in accordance with
* the Business Source License, use of this software will be governed
* by the Apache License, Version 2.0
*/

#pragma once

namespace crash_tracker {

void install_sighandlers();

} // namespace crash_tracker
2 changes: 2 additions & 0 deletions src/v/redpanda/application.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
#include "config/node_config.h"
#include "config/seed_server.h"
#include "config/types.h"
#include "crash_tracker/signals.h"
#include "crypto/ossl_context_service.h"
#include "datalake/cloud_data_io.h"
#include "datalake/coordinator/catalog_factory.h"
Expand Down Expand Up @@ -1028,6 +1029,7 @@ void application::check_environment() {
void application::init_crashtracker(::stop_signal& app_signal) {
_crash_tracker_service = std::make_unique<crash_tracker::service>();
_crash_tracker_service->start(app_signal.abort_source()).get();
crash_tracker::install_sighandlers();
}

void application::schedule_crash_tracker_file_cleanup() {
Expand Down
39 changes: 35 additions & 4 deletions tests/rptest/services/redpanda.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,10 @@ class RedpandaServiceBase(RedpandaServiceABC, Service):
}
}

# Thread name of shards to be used with redpanda_tid()
SHARD_0_THREAD_NAME = "redpanda"
SHARD_1_THREAD_NAME = "reactor-1"

class FIPSMode(Enum):
disabled = 0
permissive = 1
Expand Down Expand Up @@ -2550,8 +2554,6 @@ def __init__(self,
# which can kill redpanda nodes.
# This is a number to allow multiple callers to set it.
self.tolerate_not_running = 0
# Do not fail test on crashes including asserts. This is useful when
# running with redpanda's fault injection enabled.
self._tolerate_crashes = False
self._rpk_node_config = rpk_node_config

Expand Down Expand Up @@ -3131,12 +3133,20 @@ def check_node(node):

return all(self.for_nodes(self._started, check_node))

def signal_redpanda(self, node, signal=signal.SIGKILL, idempotent=False):
def signal_redpanda(self,
node,
signal=signal.SIGKILL,
idempotent=False,
thread=None):
"""
:param idempotent: if true, then kill-like signals are ignored if
the process is already gone.
:param thread: if set, then the signal is sent to the given thread
"""
pid = self.redpanda_pid(node)
if thread is None:
pid = self.redpanda_pid(node)
else:
pid = self.redpanda_tid(node, thread)
if pid is None:
if idempotent and signal in {signal.SIGKILL, signal.SIGTERM}:
return
Expand Down Expand Up @@ -4188,6 +4198,20 @@ def redpanda_pid(self, node):

raise e

def redpanda_tid(self, node, thread):
"""Return the thread ID of the given thread"""
cmd = "ps -C redpanda -T"
for line in node.account.ssh_capture(cmd, timeout_sec=10):
# Example line:
# PID SPID TTY TIME CMD
# 2662879 2662879 pts/16 00:00:02 redpanda
self.logger.debug(f"ps output: {line}")
parts = line.split()
thread_name = parts[4]
if thread == thread_name:
return int(parts[1])
return None

def started_nodes(self) -> List[ClusterNode]:
return list(self._started)

Expand Down Expand Up @@ -5342,6 +5366,13 @@ def set_up_failure_injection(self, finject_cfg: FailureInjectionConfig,

self.logger.info(f"Set up failure injection config for nodes: {nodes}")

def set_tolerate_crashes(self, tolerate_crashes: bool):
"""
Do not fail test on crashes including asserts. This is useful when
running with redpanda's fault injection enabled.
"""
self._tolerate_crashes = tolerate_crashes

def validate_controller_log(self):
"""
This method is for use at end of tests, to detect issues that might
Expand Down
Loading

0 comments on commit b586de8

Please sign in to comment.