Skip to content

Commit

Permalink
Don't collect system metrics on-demand, use internal ticker:
Browse files Browse the repository at this point in the history
- Update prometheus metrics directly (when metrics are enabled), remove extra message passing.
- Start using `RPC_RECV_BYTES` and `RPC_SENT_BYTES` instead of stubbed
  values in external metrics.
- Collect epoch metrics only when needed.
- Add CLI option to specify metrics update interval, set default value
  to 5 seconds.
- Don't update system metrics if no one is requesting them. This may
  result in an outdated metrics during first request to /metrics, but
  allows to avoid unnecessary work.

Grafana:

- Track metrics server response times in Grafana.
- Update network dashboard by adding various aggregation queries to
  multiple charts.
- Update overview dashboard to use previously renamed metrics.

Other:

- Rename `metrics` variable to `metrics_enabled` to better reflect its boolean nature.
- Bump `sys_info` dependency.
- Fix print message formatting.
  • Loading branch information
Tumas committed Dec 20, 2024
1 parent afe5d97 commit 2259019
Show file tree
Hide file tree
Showing 25 changed files with 1,809 additions and 1,322 deletions.
6 changes: 2 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ snap = '1'
static_assertions = '1'
strum = { version = '0.26', features = ['derive'] }
syn = { version = '2', features = ['full'] }
sysinfo = '0.31'
sysinfo = '0.33'
tap = '1'
tempfile = '3'
test-case = '3'
Expand Down
35 changes: 20 additions & 15 deletions grandine/src/grandine_args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ use reqwest::header::HeaderValue;
use runtime::{
MetricsConfig, StorageConfig, DEFAULT_ETH1_DB_SIZE, DEFAULT_ETH2_DB_SIZE,
DEFAULT_LIBP2P_IPV4_PORT, DEFAULT_LIBP2P_IPV6_PORT, DEFAULT_LIBP2P_QUIC_IPV4_PORT,
DEFAULT_LIBP2P_QUIC_IPV6_PORT, DEFAULT_METRICS_PORT, DEFAULT_REQUEST_TIMEOUT,
DEFAULT_TARGET_PEERS, DEFAULT_TARGET_SUBNET_PEERS, DEFAULT_TIMEOUT,
DEFAULT_LIBP2P_QUIC_IPV6_PORT, DEFAULT_METRICS_PORT, DEFAULT_METRICS_UPDATE_INTERVAL_SECONDS,
DEFAULT_REQUEST_TIMEOUT, DEFAULT_TARGET_PEERS, DEFAULT_TARGET_SUBNET_PEERS, DEFAULT_TIMEOUT,
};
use serde::{de::DeserializeOwned, Serialize};
use serde_json::Value;
Expand Down Expand Up @@ -343,8 +343,8 @@ struct BeaconNodeOptions {
back_sync_enabled: bool,

/// Collect Prometheus metrics
#[clap(long)]
metrics: bool,
#[clap(long = "metrics")]
metrics_enabled: bool,

/// Metrics address for metrics endpoint
#[clap(long, default_value_t = IpAddr::V4(Ipv4Addr::LOCALHOST))]
Expand All @@ -354,6 +354,10 @@ struct BeaconNodeOptions {
#[clap(long, default_value_t = DEFAULT_METRICS_PORT)]
metrics_port: u16,

/// Update system metrics every n seconds
#[clap(long, default_value_t = DEFAULT_METRICS_UPDATE_INTERVAL_SECONDS)]
metrics_update_interval: u64,

/// Optional remote metrics URL that Grandine will periodically send metrics to
#[clap(long)]
remote_metrics_url: Option<RedactingUrl>,
Expand Down Expand Up @@ -499,7 +503,7 @@ impl NetworkConfigOptions {
self,
network: Network,
network_dir: PathBuf,
metrics: bool,
metrics_enabled: bool,
in_memory: bool,
) -> NetworkConfig {
let Self {
Expand Down Expand Up @@ -542,7 +546,7 @@ impl NetworkConfigOptions {
network_config.discv5_config.enr_update = !disable_enr_auto_update;
network_config.upnp_enabled = !disable_upnp;
network_config.network_dir = in_memory.not().then_some(network_dir);
network_config.metrics_enabled = metrics;
network_config.metrics_enabled = metrics_enabled;
network_config.target_peers = target_peers;
network_config.target_subnet_peers = target_subnet_peers;
network_config.trusted_peers = trusted_peers;
Expand Down Expand Up @@ -900,9 +904,10 @@ impl GrandineArgs {
jwt_secret,
jwt_version,
back_sync_enabled,
metrics,
metrics_enabled,
metrics_address,
metrics_port,
metrics_update_interval,
remote_metrics_url,
track_liveness,
detect_doppelgangers,
Expand Down Expand Up @@ -949,9 +954,10 @@ impl GrandineArgs {
warn!("both --configuration-file and --verify-configuration-file specified");
}

if remote_metrics_url.is_some() && !metrics {
if remote_metrics_url.is_some() && !metrics_enabled {
warn!(
"Remote metrics enabled without ---metrics. Network metrics will not be available"
"remote metrics enabled without ---metrics. \
Network, system and process metrics will not be available"
);
}

Expand Down Expand Up @@ -1100,7 +1106,7 @@ impl GrandineArgs {
);

// enable global feature for easier checking
if metrics {
if metrics_enabled {
features.push(Feature::PrometheusMetrics);
}

Expand All @@ -1112,15 +1118,15 @@ impl GrandineArgs {
|| features.contains(&Feature::PrometheusMetrics)
|| features.contains(&Feature::ServeLeakyEndpoints))
.then(|| MetricsServiceConfig {
remote_metrics_url,
directories: directories.clone_arc(),
metrics_update_interval: Duration::from_secs(metrics_update_interval),
remote_metrics_url,
});

let metrics_server_config = metrics.then_some(MetricsServerConfig {
let metrics_server_config = metrics_enabled.then_some(MetricsServerConfig {
metrics_address,
metrics_port,
timeout: request_timeout,
directories: directories.clone_arc(),
});

let http_api_config = HttpApiConfig::from(http_api_options);
Expand All @@ -1147,8 +1153,7 @@ impl GrandineArgs {
);
}

let metrics_enabled = metrics;
let metrics = if metrics {
let metrics = if metrics_enabled {
let metrics = Metrics::new()?;
metrics.register_with_default_metrics()?;
let metrics = Arc::new(metrics);
Expand Down
9 changes: 8 additions & 1 deletion grandine/src/grandine_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,18 @@ impl GrandineConfig {

if let Some(metrics_server_config) = &metrics_config.metrics_server_config {
info!(
"Metrics server address: {}",
"metrics server address: {}",
SocketAddr::from(metrics_server_config),
);
}

if let Some(metrics_service_config) = &metrics_config.metrics_service_config {
info!(
"metrics service configured with {:?} update interval",
metrics_service_config.metrics_update_interval,
);
}

if let Some(validator_api_config) = validator_api_config.as_ref() {
info!("validator API address: {}", validator_api_config.address);
} else {
Expand Down
1 change: 0 additions & 1 deletion http_api/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ itertools = { workspace = true }
liveness_tracker = { workspace = true }
log = { workspace = true }
mediatype = { workspace = true }
metrics = { workspace = true }
mime = { workspace = true }
operation_pools = { workspace = true }
p2p = { workspace = true }
Expand Down
1 change: 0 additions & 1 deletion http_api/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,6 @@ impl<P: Preset> Context<P> {

let channels = Channels {
api_to_liveness_tx: Some(api_to_liveness_tx),
api_to_metrics_tx: None,
api_to_p2p_tx,
api_to_validator_tx,
subnet_service_tx,
Expand Down
17 changes: 0 additions & 17 deletions http_api/src/global.rs

This file was deleted.

1 change: 0 additions & 1 deletion http_api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ mod block_id;
mod error;
mod extractors;
mod full_config;
mod global;
mod gui;
mod http_api_config;
mod middleware;
Expand Down
17 changes: 5 additions & 12 deletions http_api/src/routing.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{collections::HashSet, sync::Arc};

use anyhow::Error as AnyhowError;
use axum::{
extract::{DefaultBodyLimit, FromRef, State},
routing::{get, post},
Expand All @@ -14,7 +15,6 @@ use futures::channel::mpsc::UnboundedSender;
use genesis::AnchorCheckpointProvider;
use http_api_utils::EventChannels;
use liveness_tracker::ApiToLiveness;
use metrics::ApiToMetrics;
use operation_pools::{AttestationAggPool, BlsToExecutionChangePool, SyncCommitteeAggPool};
use p2p::{ApiToP2p, NetworkConfig, ToSubnetService};
use prometheus_metrics::Metrics;
Expand All @@ -25,7 +25,6 @@ use validator::{ApiToValidator, ValidatorConfig};

use crate::{
error::Error,
global::{self},
gui, middleware,
misc::{BackSyncedStatus, SyncedStatus},
standard::{
Expand Down Expand Up @@ -80,7 +79,6 @@ pub struct NormalState<P: Preset, W: Wait> {
pub is_back_synced: Arc<BackSyncedStatus>,
pub event_channels: Arc<EventChannels>,
pub api_to_liveness_tx: Option<UnboundedSender<ApiToLiveness>>,
pub api_to_metrics_tx: Option<UnboundedSender<ApiToMetrics>>,
pub api_to_p2p_tx: UnboundedSender<ApiToP2p<P>>,
pub api_to_validator_tx: UnboundedSender<ApiToValidator<P>>,
pub subnet_service_tx: UnboundedSender<ToSubnetService>,
Expand Down Expand Up @@ -178,12 +176,6 @@ impl<P: Preset, W: Wait> FromRef<NormalState<P, W>> for Option<UnboundedSender<A
}
}

impl<P: Preset, W: Wait> FromRef<NormalState<P, W>> for Option<UnboundedSender<ApiToMetrics>> {
fn from_ref(state: &NormalState<P, W>) -> Self {
state.api_to_metrics_tx.clone()
}
}

impl<P: Preset, W: Wait> FromRef<NormalState<P, W>> for UnboundedSender<ApiToP2p<P>> {
fn from_ref(state: &NormalState<P, W>) -> Self {
state.api_to_p2p_tx.clone()
Expand Down Expand Up @@ -302,11 +294,12 @@ fn gui_routes<P: Preset, W: Wait>() -> Router<NormalState<P, W>> {
.route(
"/system/stats",
get(|extracted| async {
let State(api_to_metrics_tx) = extracted;
let State::<Option<Arc<Metrics>>>(metrics) = extracted;

global::get_system_stats(api_to_metrics_tx)
.await
metrics
.map(|metrics| metrics.system_stats())
.map(Json)
.ok_or_else(|| AnyhowError::msg("metrics service is not configured"))
.map_err(Error::Internal)
})
.route_layer(axum::middleware::map_request_with_state(
Expand Down
4 changes: 0 additions & 4 deletions http_api/src/task.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ use genesis::AnchorCheckpointProvider;
use http_api_utils::{ApiMetrics, EventChannels};
use liveness_tracker::ApiToLiveness;
use log::info;
use metrics::ApiToMetrics;
use operation_pools::{AttestationAggPool, BlsToExecutionChangePool, SyncCommitteeAggPool};
use p2p::{ApiToP2p, NetworkConfig, SyncToApi, ToSubnetService};
use prometheus_metrics::Metrics;
Expand All @@ -35,7 +34,6 @@ use crate::{

pub struct Channels<P: Preset> {
pub api_to_liveness_tx: Option<UnboundedSender<ApiToLiveness>>,
pub api_to_metrics_tx: Option<UnboundedSender<ApiToMetrics>>,
pub api_to_p2p_tx: UnboundedSender<ApiToP2p<P>>,
pub api_to_validator_tx: UnboundedSender<ApiToValidator<P>>,
pub subnet_service_tx: UnboundedSender<ToSubnetService>,
Expand Down Expand Up @@ -100,7 +98,6 @@ impl<P: Preset, W: Wait> HttpApi<P, W> {

let Channels {
api_to_liveness_tx,
api_to_metrics_tx,
api_to_p2p_tx,
api_to_validator_tx,
subnet_service_tx,
Expand All @@ -127,7 +124,6 @@ impl<P: Preset, W: Wait> HttpApi<P, W> {
is_back_synced: is_back_synced.clone_arc(),
event_channels,
api_to_liveness_tx,
api_to_metrics_tx,
api_to_p2p_tx,
api_to_validator_tx,
subnet_service_tx,
Expand Down
12 changes: 12 additions & 0 deletions http_api_utils/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use prometheus_metrics::Metrics;
#[derive(Clone, Copy)]
enum ApiType {
Http,
Metrics,
Validator,
}

Expand All @@ -25,6 +26,14 @@ impl ApiMetrics {
}
}

#[must_use]
pub const fn metrics(prometheus_metrics: Arc<Metrics>) -> Self {
Self {
api_type: ApiType::Metrics,
prometheus_metrics,
}
}

#[must_use]
pub const fn validator(prometheus_metrics: Arc<Metrics>) -> Self {
Self {
Expand All @@ -38,6 +47,9 @@ impl ApiMetrics {
ApiType::Http => self
.prometheus_metrics
.set_http_api_response_time(labels, response_duration),
ApiType::Metrics => self
.prometheus_metrics
.set_metrics_api_response_time(labels, response_duration),
ApiType::Validator => self
.prometheus_metrics
.set_validator_api_response_time(labels, response_duration),
Expand Down
1 change: 0 additions & 1 deletion metrics/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ chrono = { workspace = true }
derive_more = { workspace = true }
directories = { workspace = true }
eth1_api = { workspace = true }
fork_choice_control = { workspace = true }
futures = { workspace = true }
grandine_version = { workspace = true }
helper_functions = { workspace = true, features = ['metrics'] }
Expand Down
28 changes: 12 additions & 16 deletions metrics/src/beaconchain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ use eth1_api::{Eth1ConnectionData, Eth1Metrics};
use grandine_version::{APPLICATION_NAME, APPLICATION_VERSION};
use helper_functions::{accessors, predicates};
use log::warn;
use p2p::metrics::PEERS_CONNECTED;
use prometheus::IntGauge;
use p2p::metrics::{PEERS_CONNECTED, RPC_RECV_BYTES, RPC_SENT_BYTES};
use prometheus::{IntCounter, IntGauge};
use psutil::{cpu::CpuTimes, process::Process};
use serde::Serialize;
use sysinfo::{Disks, System};
Expand Down Expand Up @@ -139,8 +139,8 @@ impl ProcessMetrics {
#[derive(Serialize)]
pub struct BeaconNodeMetrics {
disk_beaconchain_bytes_total: u64,
network_libp2p_bytes_total_receive: i64,
network_libp2p_bytes_total_transmit: i64,
network_libp2p_bytes_total_receive: u64,
network_libp2p_bytes_total_transmit: u64,
network_peers_connected: i64,
sync_eth1_connected: bool,
sync_eth2_synced: bool,
Expand Down Expand Up @@ -176,19 +176,15 @@ impl BeaconNodeMetrics {
.map(IntGauge::get)
.unwrap_or_default();

// TODO(feature/metrics): figure this out with prometheus_client
// let network_libp2p_bytes_total_receive = INBOUND_LIBP2P_BYTES
// .as_ref()
// .map(IntGauge::get)
// .unwrap_or_default();

// let network_libp2p_bytes_total_transmit = OUTBOUND_LIBP2P_BYTES
// .as_ref()
// .map(IntGauge::get)
// .unwrap_or_default();
let network_libp2p_bytes_total_receive = RPC_RECV_BYTES
.as_ref()
.map(IntCounter::get)
.unwrap_or_default();

let network_libp2p_bytes_total_receive = 0;
let network_libp2p_bytes_total_transmit = 0;
let network_libp2p_bytes_total_transmit = RPC_SENT_BYTES
.as_ref()
.map(IntCounter::get)
.unwrap_or_default();

let disk_beaconchain_bytes_total = config.directories.disk_usage().unwrap_or_default();

Expand Down
Loading

0 comments on commit 2259019

Please sign in to comment.