Add gubernator dashboard (#563)
* wip

Signed-off-by: Thibault Mange <[email protected]>

* wip

Signed-off-by: Thibault Mange <[email protected]>

* clean jsonnet file

Signed-off-by: Thibault Mange <[email protected]>

* add dashboard links and graphs

Signed-off-by: Thibault Mange <[email protected]>

* fix

Signed-off-by: Thibault Mange <[email protected]>

* fix cpu

Signed-off-by: Thibault Mange <[email protected]>

* fix legend

Signed-off-by: Thibault Mange <[email protected]>

---------

Signed-off-by: Thibault Mange <[email protected]>
thibaultmg authored Jul 28, 2023
1 parent 91d48ce commit 0daf607
Showing 5 changed files with 2,233 additions and 8 deletions.
218 changes: 218 additions & 0 deletions observability/dashboards/observatorium-gubernator.libsonnet
@@ -0,0 +1,218 @@
local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
local template = import 'grafonnet/template.libsonnet';

function() {
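// Helper producing a stacked graph panel with a description and a y-axis unit.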
local panel(title, description='', unit='short') =
g.panel(title) {
description: description,
fill: 1,
fillGradient: 0,
linewidth: 1,
span: 0,
stack: true,
yaxes: g.yaxes(unit),
},

local datasourcesRegex = '/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/',
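// Label selectors shared by the PromQL queries below.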
local labelMatchers = {
ns: 'namespace="$namespace"',
job: 'job="observatorium-gubernator"',
nsAndJob: std.join(', ', [self.ns, self.job]),
pod: 'pod=~"observatorium-gubernator.*"',
container: 'container="gubernator"',
},
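// $interval template variable, used as the range selector in the rate() queries.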
local intervalTemplate =
template.interval(
'interval',
'5m,10m,30m,1h,6h,12h,auto',
label='interval',
current='5m',
),

dashboard:: {
data:
g.dashboard('Observatorium - Gubernator')
.addTemplate('namespace', 'gubernator_check_counter', 'namespace')
.addRow(
g.row('GetRateLimits API')
.addPanel(
panel('Requests', 'Rate of gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits"}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits", status="failed"}[$interval]))' % labelMatchers,
'{{status}} {{job}}',
)
)
.addPanel(
panel('Latencies', 'Latency of gRPC requests to the API, by percentile', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetRateLimits"}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
.addPanel(
panel('Over Limit requests rate', 'Rate of requests that resulted in rate limiting (over the limit) per second', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_over_limit_counter{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
)
.addRow(
g.row('GetPeerRateLimits API')
.addPanel(
panel('Requests', 'Rate of gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits", status="failed"}[$interval]))' % labelMatchers,
'{{status}} {{job}}',
)
)
.addPanel(
panel('Latencies', 'Latency of gRPC requests to the API, by percentile', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
)
.addRow(
g.row('Queues')
.addPanel(
panel('getRateLimitsBatch queue length', 'The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued for batching to a remote peer.', '') +
g.queryPanel(
'sum by(job) (rate(gubernator_queue_length{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('GetRateLimit queue length', 'The number of GetRateLimit requests queued up in GubernatorPool workers.', '') +
g.queryPanel(
'sum by(job) (rate(gubernator_pool_queue_length{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
)
.addRow(
g.row('Cache')
.addPanel(
panel('Requests', 'Rate of cache requests per second', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Misses', 'Ratio of cache misses to total cache accesses', 'percentunit') +
g.queryPanel(
'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s, type="miss"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Size', 'The number of items in LRU Cache which holds the rate limits.', '') +
g.queryPanel(
'sum by(job) (gubernator_cache_size{%(nsAndJob)s})' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Unexpired evictions', 'Rate of cache items evicted before expiring, per second.', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_unexpired_evictions_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
)
.addRow(
g.row('Other latencies')
.addPanel(
panel('Batch', 'Latency of batch send operations to a remote peer, by percentile', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_batch_send_duration{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
.addPanel(
panel('Broadcast', 'Latency of GLOBAL broadcasts to peers, by percentile', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_broadcast_durations{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
.addPanel(
panel('Async', 'Latency of GLOBAL async sends, by percentile', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_async_durations{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
)
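// Resource usage from cAdvisor (container_*) and kube-state-metrics (kube_pod_container_*).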
.addRow(
g.row('Resources usage')
.addPanel(
panel('Memory Usage', 'Memory usage of the Gubernator process', 'MiB') +
g.queryPanel(
'container_memory_working_set_bytes{%(container)s, %(pod)s, %(ns)s} / 1024^2' % labelMatchers,
'memory usage system {{pod}}',
)
)
.addPanel(
panel('CPU Usage', 'CPU usage of the Gubernator process', 'percent') +
g.queryPanel(
'rate(container_cpu_usage_seconds_total{%(container)s, %(pod)s, %(ns)s}[$interval]) * 100' % labelMatchers,
'cpu usage system {{pod}}',
)
)
.addPanel(
panel('Pod/Container Restarts', 'Number of times the pod/container has restarted', '') +
g.queryPanel(
'sum by (pod) (kube_pod_container_status_restarts_total{%(container)s, %(pod)s, %(ns)s})' % labelMatchers,
'pod restart count {{pod}}',
)
)
.addPanel(
panel('Network Usage', 'Network usage of the Gubernator process', 'binBps') +
g.queryPanel(
[
'sum by (pod) (rate(container_network_receive_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers,
'sum by (pod) (rate(container_network_transmit_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers,
],
[
'network traffic in {{pod}}',
'network traffic out {{pod}}',
]
)
)
) + {
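// Patch the generated templating: restrict the datasource variable to datasourcesRegex and append $interval.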
templating+: {
list: [
if variable.name == 'datasource'
then variable { regex: datasourcesRegex }
else variable
for variable in super.list
] + [intervalTemplate],
},
},
},
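// Emit the dashboard as a ConfigMap, matching the other dashboards wired up in observability/grafana.jsonnet.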
apiVersion: 'v1',
kind: 'ConfigMap',
metadata: {
name: 'grafana-dashboard-observatorium-gubernator',
},
data: {
'rhobs-instance-observatorium-gubernator.json': std.manifestJsonEx($.dashboard.data, ' '),
},
}
@@ -32,7 +32,7 @@ function() {
template.new(
name='job',
datasource='$datasource',
query='label_values(up{namespace="$namespace", job=~"observatorium-thanos-.*|observatorium-ruler-query.*"}, job)',
query='label_values(up{namespace="$namespace", job=~"observatorium-thanos-.*|observatorium-ruler-query.*|observatorium-gubernator"}, job)',
label='job',
allValues='.+',
current='',
@@ -618,6 +618,7 @@ function() {
g.row('Gubernator Overview')
.addPanel(
g.panel('Rate of gRPC requests', 'Shows count of gRPC requests to gubernator') +
g.addDashboardLink(thanos.gubernator.dashboard.title) +
g.queryPanel(
[
'sum(rate(gubernator_grpc_request_counts{namespace="$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)',
@@ -630,6 +631,7 @@
)
.addPanel(
g.panel('Rate of errors in gRPC requests', 'Shows count of errors in gRPC requests to gubernator') { span:: 0 } +
g.addDashboardLink(thanos.gubernator.dashboard.title) +
g.queryPanel(
[
'sum(rate(gubernator_grpc_request_counts{status="failed",namespace="$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)',
@@ -642,6 +644,7 @@
)
.addPanel(
g.panel('Duration of gRPC requests', 'Shows duration of gRPC requests to gubernator') +
g.addDashboardLink(thanos.gubernator.dashboard.title) +
g.queryPanel(
[
'gubernator_grpc_request_duration{quantile="0.99", namespace="$namespace",job=~"$job"}',
@@ -656,6 +659,7 @@
)
.addPanel(
g.panel('Local queue of rate checks', 'Shows the number of rate checks in the local queue') +
g.addDashboardLink(thanos.gubernator.dashboard.title) +
g.queryPanel(
[
'gubernator_pool_queue_length{namespace="$namespace",job=~"$job"}',
@@ -667,6 +671,7 @@
)
.addPanel(
g.panel('Peer queue of rate checks', 'Shows the number of rate checks in the peer queue') +
g.addDashboardLink(thanos.gubernator.dashboard.title) +
g.queryPanel(
[
'gubernator_queue_length{namespace="$namespace",job=~"$job"}',
@@ -677,17 +682,21 @@
) { span:: 0 }
)
.addPanel(
g.addDashboardLink(thanos.gubernator.dashboard.title) +
memoryUsagePanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod) +
{ yaxes: g.yaxes('bytes') } +
g.stack
)
.addPanel(
g.addDashboardLink(thanos.gubernator.dashboard.title) +
cpuUsagePanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod)
)
.addPanel(
g.addDashboardLink(thanos.gubernator.dashboard.title) +
podRestartPanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod)
)
.addPanel(
g.addDashboardLink(thanos.gubernator.dashboard.title) +
networkUsagePanel(thanos.gubernator.dashboard.pod) +
g.stack +
{ yaxes: g.yaxes('binBps') }
3 changes: 2 additions & 1 deletion observability/grafana.jsonnet
@@ -75,7 +75,8 @@ local dashboards =
{ 'grafana-dashboard-tracing-otel.configmap': (import 'dashboards/opentelemetry.libsonnet')(obsDatasource, obsTraces) } +
{ 'grafana-dashboard-tracing-jaeger.configmap': (import 'dashboards/tracing.libsonnet')(obsDatasource, obsTraces) } +
{ 'grafana-dashboard-rhobs-instance-utilization-overview.configmap': (import 'dashboards/rhobs-instance-utilization-overview.libsonnet')() } +
{ 'grafana-dashboard-rules-objstore.configmap': (import 'dashboards/rules-objstore.libsonnet')() };
{ 'grafana-dashboard-rules-objstore.configmap': (import 'dashboards/rules-objstore.libsonnet')() } +
{ 'grafana-dashboard-observatorium-gubernator.configmap': (import 'dashboards/observatorium-gubernator.libsonnet')() };
{
[name]: dashboards[name] {
metadata+: {
