Skip to content

Commit

Permalink
Add support for pre-caching process VMA metadata
Browse files Browse the repository at this point in the history
Users may legitimately be interested in being able to symbolize
addresses inside a process even if the process has exited in the
meantime.
To support such use cases, introduce a new top-level API to the
Symbolizer: the cache() method allows for caching various data
pertaining to a symbolization source. At this point we only support caching
of process VMA metadata, but in the future we can support pre-parsing
ELF and DWARF metadata and similar, which could speed up symbolization
requests significantly.

Refs: libbpf#433

Signed-off-by: Daniel Müller <[email protected]>
  • Loading branch information
d-e-s-o committed Feb 10, 2025
1 parent ea6b24e commit 68951d2
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
Unreleased
----------
- Added support for 32 bit ELF binaries
- Added `symbolize::Symbolizer::cache` method for caching process VMA
metadata
- Renamed `symbolize::Source::kernel_image` to `vmlinux`
- Adjusted kernel symbolization logic to give preference to `vmlinux`
file, if present
Expand Down
111 changes: 111 additions & 0 deletions src/symbolize/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
//! Definitions of symbolization caching targets.
use std::fmt::Debug;
use std::fmt::Formatter;
use std::fmt::Result as FmtResult;

use crate::Pid;


/// Configuration for caching of process-level data.
///
/// Use [`Process::new`] to create an instance with VMA caching enabled
/// and adjust the public members afterwards as needed. An object of
/// this type can be converted into a [`Cache`] by means of `From`.
#[derive(Clone)]
pub struct Process {
/// The referenced process' ID.
pub pid: Pid,
/// Whether to cache the process' VMAs for later use.
///
/// Caching VMAs can be useful, because it conceptually enables the
/// library to serve a symbolization request targeting a process
/// even if said process has since exited the system.
///
/// Note that once VMAs have been cached this way, the library will
/// refrain from re-reading updated VMAs unless instructed to.
/// Hence, if you have reason to believe that a process may have
/// changed its memory regions (by loading a new shared object, for
/// example), you have to issue another caching request yourself in
/// order to refresh them.
///
/// Note furthermore that if you cache VMAs to later symbolize
/// addresses after the original process has already exited, you
/// will have to opt out of usage of `/proc/<pid>/map_files/` as
/// part of the symbolization request. Refer to
/// [`source::Process::map_files`][crate::symbolize::source::Process::map_files].
pub cache_vmas: bool,
/// The struct is non-exhaustive and open to extension.
#[doc(hidden)]
pub _non_exhaustive: (),
}

impl Process {
    /// Create a new [`Process`] caching configuration for the process
    /// identified by `pid`.
    ///
    /// `cache_vmas` defaults to `true` when using this constructor.
    #[inline]
    pub fn new(pid: Pid) -> Self {
        // VMA caching is opt-out when going through this constructor.
        let cache_vmas = true;

        Self {
            pid,
            cache_vmas,
            _non_exhaustive: (),
        }
    }
}

impl Debug for Process {
    /// Format the object as a tuple struct named `Process` containing
    /// only the process ID; `cache_vmas` is not part of the output.
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        // Destructure exhaustively, so that adding a member to the type
        // forces this implementation to be revisited.
        let Self {
            pid: process_id,
            cache_vmas: _,
            _non_exhaustive: (),
        } = self;

        let mut repr = f.debug_tuple("Process");
        // Go through `format_args!` so that the ID is emitted via its
        // `Display` representation.
        repr.field(&format_args!("{process_id}"));
        repr.finish()
    }
}

impl From<Process> for Cache<'static> {
#[inline]
fn from(process: Process) -> Self {
Self::Process(process)
}
}


/// A description of what data to cache in advance, so that subsequent
/// symbolization requests can be satisfied quicker.
#[derive(Clone)]
#[non_exhaustive]
pub enum Cache<'dat> {
/// Cache data of a process, as configured by the contained
/// [`Process`] object.
Process(Process),
// Hidden variant that is the only user of the `'dat` lifetime
// parameter. The `Debug` implementation of this type treats it as
// unreachable.
// NOTE(review): presumably `'dat` is reserved for future variants
// borrowing data -- confirm.
#[doc(hidden)]
Phantom(&'dat ()),
}

impl Debug for Cache<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
match self {
Self::Process(process) => Debug::fmt(process, f),
Self::Phantom(()) => unreachable!(),
}
}
}


#[cfg(test)]
mod tests {
    use super::*;


    /// Check the `Debug` output of the caching configuration types.
    #[test]
    fn debug_repr() {
        // A configuration referring to the calling process itself.
        let own = Process::new(Pid::Slf);
        assert_eq!(format!("{own:?}"), "Process(self)");

        // A configuration referring to a process by numeric ID.
        let numbered = Process::new(Pid::from(1234));
        assert_eq!(format!("{numbered:?}"), "Process(1234)");

        // Wrapping the configuration in a `Cache` must not alter the
        // representation.
        let wrapped = Cache::from(numbered);
        assert_eq!(format!("{wrapped:?}"), "Process(1234)");
    }
}
1 change: 1 addition & 0 deletions src/symbolize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
//! [`gsym-in-apk`](https://github.com/libbpf/blazesym/blob/main/examples/gsym-in-apk)
//! example, which illustrates the basic workflow.
pub mod cache;
pub mod source;
mod symbolizer;

Expand Down
65 changes: 59 additions & 6 deletions src/symbolize/symbolizer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use std::borrow::Cow;
use std::cell::RefCell;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fmt::Debug;
use std::fmt::Formatter;
Expand Down Expand Up @@ -54,6 +56,8 @@ use crate::IntoError as _;
use crate::Pid;
use crate::Result;

use super::cache;
use super::cache::Cache;
#[cfg(feature = "apk")]
use super::source::Apk;
#[cfg(feature = "breakpad")]
Expand Down Expand Up @@ -396,6 +400,7 @@ impl Builder {
gsym_cache: FileCache::builder().enable_auto_reload(auto_reload).build(),
ksym_cache: FileCache::builder().enable_auto_reload(auto_reload).build(),
perf_map_cache: FileCache::builder().enable_auto_reload(auto_reload).build(),
process_vma_cache: RefCell::new(HashMap::new()),
process_cache: InsertMap::new(),
find_sym_opts,
demangle,
Expand Down Expand Up @@ -627,6 +632,11 @@ pub struct Symbolizer {
gsym_cache: FileCache<GsymResolver<'static>>,
ksym_cache: FileCache<Rc<KsymResolver>>,
perf_map_cache: FileCache<PerfMap>,
/// Cache of VMA data on per-process basis.
///
/// This member is only populated by explicit requests for caching
/// data by the user.
process_vma_cache: RefCell<HashMap<Pid, Box<[maps::MapsEntry]>>>,
process_cache: InsertMap<PathName, Option<Box<dyn Resolve>>>,
find_sym_opts: FindSymOpts,
demangle: bool,
Expand Down Expand Up @@ -898,9 +908,6 @@ impl Symbolizer {
perf_map: bool,
map_files: bool,
) -> Result<Vec<Symbolized>> {
let mut entry_iter = maps::parse_filtered(pid)?;
let entries = |_addr| entry_iter.next();

let mut handler = SymbolizeHandler {
symbolizer: self,
pid,
Expand All @@ -914,9 +921,27 @@ impl Symbolizer {
addrs,
|handler: &mut SymbolizeHandler<'_>| handler.all_symbols.as_mut_slice(),
|sorted_addrs| -> Result<SymbolizeHandler<'_>> {
let () =
normalize_sorted_user_addrs_with_entries(sorted_addrs, entries, &mut handler)?;
Ok(handler)
if let Some(cached) = self.process_vma_cache.borrow().get(&pid) {
let mut entry_iter = cached.iter().map(Ok);
let entries = |_addr| entry_iter.next();

let () = normalize_sorted_user_addrs_with_entries(
sorted_addrs,
entries,
&mut handler,
)?;
Ok(handler)
} else {
let mut entry_iter = maps::parse_filtered(pid)?;
let entries = |_addr| entry_iter.next();

let () = normalize_sorted_user_addrs_with_entries(
sorted_addrs,
entries,
&mut handler,
)?;
Ok(handler)
}
},
)?;
Ok(handler.all_symbols)
Expand Down Expand Up @@ -1022,6 +1047,34 @@ impl Symbolizer {
))
}

/// Cache some or all information associated with a symbolization
/// source.
///
/// Symbolization data is generally being cached when symbolization is
/// performed. However, sometimes it is necessary to cache data early,
/// for example to make subsequent symbolization requests run as fast
/// as possible. In rare instances it can also be a matter of
/// correctness: process metadata such as VMAs and their offsets can be
/// cached so that symbolization requests can still be satisfied even
/// after the process has exited.
///
/// # Errors
/// Errors reported by `maps::parse_filtered` or by any of the entries
/// it yields are propagated to the caller.
#[cfg_attr(feature = "tracing", crate::log::instrument(skip_all, fields(cache = ?cache), err))]
pub fn cache(&self, cache: &Cache) -> Result<()> {
    match cache {
        Cache::Phantom(()) => unreachable!(),
        Cache::Process(cache::Process {
            pid,
            cache_vmas,
            _non_exhaustive: (),
        }) => {
            if *cache_vmas {
                // Collect all entries up front: the first entry that
                // fails to parse fails the request as a whole, before
                // anything is inserted into the cache.
                let entries = maps::parse_filtered(*pid)?.collect::<Result<Box<_>>>()?;
                // Data cached earlier for the same process is replaced.
                let _prev = self.process_vma_cache.borrow_mut().insert(*pid, entries);
            }
            Ok(())
        }
    }
}

/// Symbolize a list of addresses.
///
/// Symbolize a list of addresses using the provided symbolization
Expand Down
40 changes: 40 additions & 0 deletions tests/suite/symbolize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::process;
use blazesym::helper::ElfResolver;
use blazesym::inspect;
use blazesym::normalize;
use blazesym::symbolize::cache;
use blazesym::symbolize::source::Breakpad;
use blazesym::symbolize::source::Elf;
use blazesym::symbolize::source::GsymData;
Expand Down Expand Up @@ -788,6 +789,45 @@ fn symbolize_process_in_mount_namespace() {
});
}

/// Check that addresses belonging to a process that has already exited
/// can still be symbolized, based on VMA data cached earlier.
#[cfg(linux)]
#[test]
fn symbolize_process_exited_cached_vmas() {
    let data_dir = Path::new(&env!("CARGO_MANIFEST_DIR")).join("data");
    let test_so = data_dir.join("libtest-so.so");
    let wait = data_dir.join("test-wait.bin");

    let symbolizer = Symbolizer::new();

    let (pid, addr) = RemoteProcess::default()
        .arg(&test_so)
        .exec(&wait, |pid, addr| {
            // Cache VMA information about the process while it is alive.
            let request = cache::Cache::from(cache::Process::new(pid));
            let () = symbolizer.cache(&request).unwrap();
            (pid, addr)
        });

    // By now the process is guaranteed to be dead (modulo PID reuse...).
    // We need to opt out of map file usage, because those files will no
    // longer be present with the process having exited.
    let mut process = Process::new(pid);
    process.map_files = false;
    let src = Source::Process(process);

    let symbolized = symbolizer
        .symbolize_single(&src, Input::AbsAddr(addr))
        .unwrap();
    let sym = symbolized.into_sym().unwrap();
    assert_eq!(sym.name, "await_input");
}

/// Check that we can symbolize an address residing in a zip archive.
#[test]
fn symbolize_process_zip() {
Expand Down

0 comments on commit 68951d2

Please sign in to comment.