-
Notifications
You must be signed in to change notification settings - Fork 599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CORE-8618 crash_tracker: record uncaught startup exceptions #24854
base: dev
Are you sure you want to change the base?
Changes from 1 commit
1fae3d0
40c0d38
c896f25
8a5ef27
452e456
cfae5ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ v_cc_library( | |
SRCS | ||
limiter.cc | ||
logger.cc | ||
prepared_writer.cc | ||
recorder.cc | ||
service.cc | ||
types.cc | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
/* | ||
* Copyright 2025 Redpanda Data, Inc. | ||
* | ||
* Use of this software is governed by the Business Source License | ||
* included in the file licenses/BSL.md | ||
* | ||
* As of the Change Date specified in that file, in accordance with | ||
* the Business Source License, use of this software will be governed | ||
* by the Apache License, Version 2.0 | ||
*/ | ||
|
||
#include "crash_tracker/prepared_writer.h" | ||
|
||
#include "crash_tracker/logger.h" | ||
#include "crash_tracker/types.h" | ||
#include "hashing/xx.h" | ||
#include "model/timestamp.h" | ||
|
||
#include <seastar/core/file-types.hh> | ||
#include <seastar/core/file.hh> | ||
#include <seastar/core/seastar.hh> | ||
#include <seastar/core/sleep.hh> | ||
#include <seastar/util/print_safe.hh> | ||
|
||
#include <fmt/chrono.h> | ||
|
||
#include <chrono> | ||
#include <fcntl.h> | ||
#include <system_error> | ||
#include <unistd.h> | ||
|
||
using namespace std::chrono_literals; | ||
|
||
namespace crash_tracker { | ||
|
||
std::ostream& operator<<(std::ostream& os, prepared_writer::state s) { | ||
switch (s) { | ||
case prepared_writer::state::uninitialized: | ||
return os << "uninitialized"; | ||
case prepared_writer::state::initialized: | ||
return os << "initialized"; | ||
case prepared_writer::state::filled: | ||
return os << "filled"; | ||
case prepared_writer::state::written: | ||
return os << "written"; | ||
case prepared_writer::state::released: | ||
return os << "released"; | ||
} | ||
} | ||
|
||
ss::future<> | ||
prepared_writer::initialize(std::filesystem::path crash_file_path) { | ||
_crash_report_file_name = std::move(crash_file_path); | ||
|
||
_serde_output.reserve_memory(crash_description::serde_size_overestimate); | ||
|
||
// Create the crash recorder file | ||
auto f = co_await ss::open_file_dma( | ||
_crash_report_file_name.c_str(), | ||
ss::open_flags::create | ss::open_flags::rw | ss::open_flags::truncate | ||
| ss::open_flags::exclusive); | ||
co_await f.close(); | ||
|
||
// Sync the parent dir to ensure that the newly create file is observable to | ||
// the ::open() call below and to later restarts of the process | ||
co_await ss::sync_directory(_crash_report_file_name.parent_path().string()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you protecting against a certain scenario by syncing the directory here? a comment would probably be helpful since it's likely important? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added this comment:
Let me know if that explains it well enough. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it doesn't hurt anything, but you don't really need to do the sync for these cases--it's only good for if the actual node crashes. |
||
|
||
// Open the crash recorder file using ::open(). | ||
// We need to use the low level open() function here instead of the seastar | ||
// API or higher-level C++ primitives because we need to be able to | ||
// manipulate the file using async-signal-safe, allocation-free functions | ||
// inside signal handlers. | ||
_fd = ::open(_crash_report_file_name.c_str(), O_WRONLY); | ||
if (_fd == -1) { | ||
throw std::system_error( | ||
errno, | ||
std::system_category(), | ||
fmt::format( | ||
"Failed to open {} to record crash reason", | ||
_crash_report_file_name)); | ||
} | ||
|
||
_state = state::initialized; | ||
} | ||
|
||
crash_description& prepared_writer::fill() { | ||
vassert(_state == state::initialized, "Unexpected state: {}", _state); | ||
_state = state::filled; | ||
_prepared_cd.crash_time = model::timestamp::now(); | ||
return _prepared_cd; | ||
} | ||
|
||
void prepared_writer::write() { | ||
vassert(_state == state::filled, "Unexpected state: {}", _state); | ||
_state = state::written; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this transition correct if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, it's correct though I get that the naming is a bit misleading. Maybe
We could add a separate |
||
|
||
if (try_write_crash()) { | ||
constexpr static std::string_view success | ||
= "Recorded crash reason to crash file.\n"; | ||
ss::print_safe(success.data(), success.size()); | ||
} else { | ||
constexpr static std::string_view failure | ||
= "Failed to record crash reason to crash file.\n"; | ||
ss::print_safe(failure.data(), failure.size()); | ||
} | ||
} | ||
|
||
bool prepared_writer::try_write_crash() { | ||
bool success = true; | ||
serde::write(_serde_output, std::move(_prepared_cd)); | ||
|
||
for (const auto& frag : _serde_output) { | ||
size_t written = 0; | ||
while (written < frag.size()) { | ||
auto res = ::write( | ||
_fd, frag.get() + written, frag.size() - written); | ||
if (res == -1) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. did you consider retrying on |
||
// Return that writing the crash failed but try to continue to | ||
// write later fragments as much information as possible | ||
success = false; | ||
break; | ||
} | ||
written += res; | ||
} | ||
} | ||
|
||
::fsync(_fd); | ||
michael-redpanda marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
return success; | ||
} | ||
|
||
ss::future<> prepared_writer::release() { | ||
vassert(_state != state::released, "Unexpected state: {}", _state); | ||
|
||
if (_state != state::uninitialized) { | ||
::close(_fd); | ||
|
||
// Remove the file and sync the parent dir to ensure that later restarts | ||
// of the process cannot observe this crash report file | ||
co_await ss::remove_file(_crash_report_file_name.c_str()); | ||
co_await ss::sync_directory( | ||
_crash_report_file_name.parent_path().string()); | ||
Comment on lines
+141
to
+142
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sync is probably unnecessary, but maybe you could provide a little context on what you're trying to protect against? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the comment I've added:
Let me know if it explains it well enough. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. visibility is handled by the kernel. process restarts don't need the sync. it doesn't hurt anything, except performance if you were doing many syncs. |
||
|
||
vlog( | ||
ctlog.debug, | ||
"Deleted crash report file: {}", | ||
_crash_report_file_name); | ||
} | ||
|
||
_state = state::released; | ||
|
||
co_return; | ||
} | ||
|
||
} // namespace crash_tracker |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Copyright 2025 Redpanda Data, Inc. | ||
* | ||
* Use of this software is governed by the Business Source License | ||
* included in the file licenses/BSL.md | ||
* | ||
* As of the Change Date specified in that file, in accordance with | ||
* the Business Source License, use of this software will be governed | ||
* by the Apache License, Version 2.0 | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "base/seastarx.h" | ||
#include "bytes/iobuf.h" | ||
#include "crash_tracker/types.h" | ||
|
||
namespace crash_tracker { | ||
|
||
// Helper to allow writing a crash_description out to a file in an async-signal | ||
// safe way. The state transition diagram for the object is shown below: | ||
// | ||
// clang-format off | ||
// +---------------+ initialize() +-------------+ fill() +--------+ write() +---------+ | ||
// | uninitialized +---------------->| initialized +---------->| filled +----------->| written | | ||
// +-----+---------+ +------+------+ +---+----+ +-+-------+ | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | release() | +--------+ | ||
// +----------------------------------+----------------------+-------------------+-->|released| | ||
// +--------+ | ||
// clang-format on | ||
class prepared_writer { | ||
public: | ||
// TODO: import the naming of the methods and their correspondence to the | ||
// state | ||
ss::future<> initialize(std::filesystem::path); | ||
ss::future<> release(); | ||
|
||
// Async-signal safe | ||
constexpr bool initialized() const { return _state == state::initialized; } | ||
|
||
/// Async-signal safe | ||
crash_description& fill(); | ||
|
||
/// Async-signal safe | ||
void write(); | ||
|
||
private: | ||
enum class state { uninitialized, initialized, filled, written, released }; | ||
friend std::ostream& operator<<(std::ostream&, state); | ||
|
||
// Returns true on success, false on failure | ||
bool try_write_crash(); | ||
|
||
state _state{state::uninitialized}; | ||
crash_description _prepared_cd; | ||
iobuf _serde_output; | ||
std::filesystem::path _crash_report_file_name; | ||
int _fd{0}; | ||
}; | ||
|
||
} // namespace crash_tracker |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
my original comment about using
exclusive
here was actually a note to myself to revisit my comment after finishing the review, but i forgot. the problem withexclusive
is that this open will fail if the file already exists on disk. is that going to be an issue, or is exclusive behavior desired? my hunch is that you may not want it, but the note to myself was just to ask if you wanted it.