-
Notifications
You must be signed in to change notification settings - Fork 599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CORE-8618 crash_tracker: record uncaught startup exceptions #24854
base: dev
Are you sure you want to change the base?
Changes from all commits
1fae3d0
40c0d38
c896f25
8a5ef27
452e456
cfae5ae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
/* | ||
* Copyright 2025 Redpanda Data, Inc. | ||
* | ||
* Use of this software is governed by the Business Source License | ||
* included in the file licenses/BSL.md | ||
* | ||
* As of the Change Date specified in that file, in accordance with | ||
* the Business Source License, use of this software will be governed | ||
* by the Apache License, Version 2.0 | ||
*/ | ||
|
||
#include "crash_tracker/prepared_writer.h" | ||
|
||
#include "crash_tracker/logger.h" | ||
#include "crash_tracker/types.h" | ||
#include "hashing/xx.h" | ||
#include "model/timestamp.h" | ||
|
||
#include <seastar/core/file-types.hh> | ||
#include <seastar/core/file.hh> | ||
#include <seastar/core/seastar.hh> | ||
#include <seastar/core/sleep.hh> | ||
#include <seastar/util/print_safe.hh> | ||
|
||
#include <fmt/chrono.h> | ||
|
||
#include <chrono> | ||
#include <fcntl.h> | ||
#include <system_error> | ||
#include <unistd.h> | ||
|
||
using namespace std::chrono_literals; | ||
|
||
namespace crash_tracker { | ||
|
||
std::ostream& operator<<(std::ostream& os, prepared_writer::state s) { | ||
switch (s) { | ||
case prepared_writer::state::uninitialized: | ||
return os << "uninitialized"; | ||
case prepared_writer::state::initialized: | ||
return os << "initialized"; | ||
case prepared_writer::state::filled: | ||
return os << "filled"; | ||
case prepared_writer::state::written: | ||
return os << "written"; | ||
case prepared_writer::state::released: | ||
return os << "released"; | ||
} | ||
} | ||
|
||
ss::future<> | ||
prepared_writer::initialize(std::filesystem::path crash_file_path) { | ||
_crash_report_file_name = std::move(crash_file_path); | ||
|
||
_serde_output.reserve_memory(crash_description::serde_size_overestimate); | ||
|
||
// Create the crash recorder file | ||
auto f = co_await ss::open_file_dma( | ||
_crash_report_file_name.c_str(), | ||
ss::open_flags::create | ss::open_flags::rw | ss::open_flags::truncate | ||
| ss::open_flags::exclusive); | ||
co_await f.close(); | ||
|
||
// Sync the parent dir to ensure that the newly create file is observable to | ||
// the ::open() call below and to later restarts of the process | ||
co_await ss::sync_directory(_crash_report_file_name.parent_path().string()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you protecting against a certain scenario by syncing the directory here? a comment would probably be helpful since it's likely important? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added this comment:
Let me know if that explains it well enough. |
||
|
||
// Open the crash recorder file using ::open(). | ||
// We need to use the low level open() function here instead of the seastar | ||
// API or higher-level C++ primitives because we need to be able to | ||
// manipulate the file using async-signal-safe, allocation-free functions | ||
// inside signal handlers. | ||
_fd = ::open(_crash_report_file_name.c_str(), O_WRONLY); | ||
if (_fd == -1) { | ||
throw std::system_error( | ||
errno, | ||
std::system_category(), | ||
fmt::format( | ||
"Failed to open {} to record crash reason", | ||
_crash_report_file_name)); | ||
} | ||
|
||
_state = state::initialized; | ||
} | ||
|
||
crash_description& prepared_writer::fill() { | ||
vassert(_state == state::initialized, "Unexpected state: {}", _state); | ||
_state = state::filled; | ||
_prepared_cd.crash_time = model::timestamp::now(); | ||
return _prepared_cd; | ||
} | ||
|
||
void prepared_writer::write() { | ||
vassert(_state == state::filled, "Unexpected state: {}", _state); | ||
_state = state::written; | ||
|
||
if (try_write_crash()) { | ||
constexpr static std::string_view success | ||
= "Recorded crash reason to crash file.\n"; | ||
ss::print_safe(success.data(), success.size()); | ||
} else { | ||
constexpr static std::string_view failure | ||
= "Failed to record crash reason to crash file.\n"; | ||
ss::print_safe(failure.data(), failure.size()); | ||
} | ||
} | ||
|
||
bool prepared_writer::try_write_crash() { | ||
bool success = true; | ||
serde::write(_serde_output, std::move(_prepared_cd)); | ||
|
||
for (const auto& frag : _serde_output) { | ||
size_t written = 0; | ||
while (written < frag.size()) { | ||
auto res = ::write( | ||
_fd, frag.get() + written, frag.size() - written); | ||
if (res == -1) { | ||
// Return that writing the crash failed but try to continue to | ||
// write later fragments as much information as possible | ||
success = false; | ||
break; | ||
} | ||
written += res; | ||
} | ||
} | ||
|
||
::fsync(_fd); | ||
michael-redpanda marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
return success; | ||
} | ||
|
||
ss::future<> prepared_writer::release() { | ||
vassert(_state != state::released, "Unexpected state: {}", _state); | ||
|
||
if (_state != state::uninitialized) { | ||
::close(_fd); | ||
|
||
// Remove the file and sync the parent dir to ensure that later restarts | ||
// of the process cannot observe this crash report file | ||
co_await ss::remove_file(_crash_report_file_name.c_str()); | ||
co_await ss::sync_directory( | ||
_crash_report_file_name.parent_path().string()); | ||
Comment on lines
+141
to
+142
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sync is probably unnecessary, but maybe you could provide a little context on what you're trying to protect against? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the comment I've added:
Let me know if it explains it well enough. |
||
|
||
vlog( | ||
ctlog.debug, | ||
"Deleted crash report file: {}", | ||
_crash_report_file_name); | ||
} | ||
|
||
_state = state::released; | ||
|
||
co_return; | ||
} | ||
|
||
} // namespace crash_tracker |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Copyright 2025 Redpanda Data, Inc. | ||
* | ||
* Use of this software is governed by the Business Source License | ||
* included in the file licenses/BSL.md | ||
* | ||
* As of the Change Date specified in that file, in accordance with | ||
* the Business Source License, use of this software will be governed | ||
* by the Apache License, Version 2.0 | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "base/seastarx.h" | ||
#include "bytes/iobuf.h" | ||
#include "crash_tracker/types.h" | ||
|
||
namespace crash_tracker { | ||
|
||
// Helper to allow writing a crash_description out to a file in an async-signal | ||
// safe way. The state transition diagram for the object is shown below: | ||
// | ||
// clang-format off | ||
// +---------------+ initialize() +-------------+ fill() +--------+ write() +---------+ | ||
// | uninitialized +---------------->| initialized +---------->| filled +----------->| written | | ||
// +-----+---------+ +------+------+ +---+----+ +-+-------+ | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | | | ||
// | | | release() | +--------+ | ||
// +----------------------------------+----------------------+-------------------+-->|released| | ||
// +--------+ | ||
// clang-format on | ||
class prepared_writer { | ||
public: | ||
// TODO: import the naming of the methods and their correspondence to the | ||
// state | ||
ss::future<> initialize(std::filesystem::path); | ||
ss::future<> release(); | ||
|
||
// Async-signal safe | ||
constexpr bool initialized() const { return _state == state::initialized; } | ||
|
||
/// Async-signal safe | ||
crash_description& fill(); | ||
|
||
/// Async-signal safe | ||
void write(); | ||
|
||
private: | ||
enum class state { uninitialized, initialized, filled, written, released }; | ||
friend std::ostream& operator<<(std::ostream&, state); | ||
|
||
// Returns true on success, false on failure | ||
bool try_write_crash(); | ||
|
||
state _state{state::uninitialized}; | ||
crash_description _prepared_cd; | ||
iobuf _serde_output; | ||
std::filesystem::path _crash_report_file_name; | ||
int _fd{0}; | ||
}; | ||
|
||
} // namespace crash_tracker |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how big are the crash reports? if they are too big, we may want to have space management track them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
~10KB, and we're going to limit to storing up to 50 of them, so their total size should be <1MB. We decided not to bother including them in space management or log listing queries because their space usage will be insignificant.