From 0daf607b88cfa99cd1678acdda76d6aaf72e85fb Mon Sep 17 00:00:00 2001 From: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> Date: Fri, 28 Jul 2023 16:26:42 +0200 Subject: [PATCH] Add gubernator dashboard (#563) * wip Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * wip Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * clean jsonnet file Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * add dashboard links and graphs Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * fix Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * fix cpu Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> * fix legend Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --------- Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --- .../observatorium-gubernator.libsonnet | 218 ++ ...bs-instance-utilization-overview.libsonnet | 11 +- observability/grafana.jsonnet | 3 +- ...rd-observatorium-gubernator.configmap.yaml | 1967 +++++++++++++++++ ...stance-utilization-overview.configmap.yaml | 42 +- 5 files changed, 2233 insertions(+), 8 deletions(-) create mode 100644 observability/dashboards/observatorium-gubernator.libsonnet create mode 100644 resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml diff --git a/observability/dashboards/observatorium-gubernator.libsonnet b/observability/dashboards/observatorium-gubernator.libsonnet new file mode 100644 index 0000000000..540f77b880 --- /dev/null +++ b/observability/dashboards/observatorium-gubernator.libsonnet @@ -0,0 +1,218 @@ +local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet'; +local template = import 'grafonnet/template.libsonnet'; +/* Builds the ConfigMap that ships the 'Observatorium - Gubernator' Grafana dashboard: the hidden dashboard.data field holds the dashboard model and the data entry serializes it with std.manifestJsonEx. The ConfigMap YAML added under resources/ appears to be generated from this output, so regenerate it after any change here. */ +function() { + /* panel(title, description, unit): g.panel preset shared by every graph on this dashboard (stacked series, fill 1, line width 1) whose y-axis is formatted with the given Grafana unit id. */ local panel(title, description='', unit='short') = + g.panel(title) { + description:
description, + fill: 1, + fillGradient: 0, + linewidth: 1, + span: 0, + stack: true, + yaxes: g.yaxes(unit), + }, + /* datasourcesRegex is applied to the 'datasource' template variable in the templating override at the end of dashboard.data; labelMatchers holds the PromQL selector fragments that the queries interpolate through %(name)s; intervalTemplate defines the $interval variable used by the range selectors. */ + local datasourcesRegex = '/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/', + local labelMatchers = { + ns: 'namespace="$namespace"', + job: 'job="observatorium-gubernator"', + nsAndJob: std.join(', ', [self.ns, self.job]), + pod: 'pod=~"observatorium-gubernator.*"', + container: 'container="gubernator"', + }, + local intervalTemplate = + template.interval( + 'interval', + '5m,10m,30m,1h,6h,12h,auto', + label='interval', + current='5m', + ), + /* Hidden field (::): the dashboard model, read back as $.dashboard.data by the ConfigMap data entry at the end of this object. */ + dashboard:: { + data: + g.dashboard('Observatorium - Gubernator') + .addTemplate('namespace', 'gubernator_check_counter', 'namespace') + .addRow( + g.row('GetRateLimits API') + .addPanel( + panel('Requests', 'Rate of gRPC requests to the API per second', 'reqps') + + g.queryPanel( + 'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits"}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( + panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') + + g.queryPanel( /* status stays in the grouping so that the {{status}} legend placeholder resolves */ + 'sum by (job, method, status) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits", status="failed"}[$interval]))' % labelMatchers, + '{{status}} {{job}}', + ) + ) + .addPanel( + panel('Latencies', 'Latency of gRPC requests to the API per percentiles', 'ms') + + g.queryPanel( + 'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetRateLimits"}) * 1000' % labelMatchers, + '{{quantile}}th percentile', + ) + ) + .addPanel( + panel('Over Limit requests rate', 'Rate of requests that resulted in rate limiting (over the limit) per second', 'reqps') + + g.queryPanel( + 'sum by(job) (rate(gubernator_over_limit_counter{%(nsAndJob)s}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + ) + .addRow( + g.row('GetPeerRateLimits API') + .addPanel( + panel('Requests', 'Rate of gRPC requests to the API per second', 
'reqps') + + g.queryPanel( + 'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( + panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') + + g.queryPanel( /* status stays in the grouping so that the {{status}} legend placeholder resolves */ + 'sum by (job, method, status) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits", status="failed"}[$interval]))' % labelMatchers, + '{{status}} {{job}}', + ) + ) + .addPanel( + panel('Latencies', 'Latency of gRPC requests to the API per percentiles', 'ms') + + g.queryPanel( + 'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}) * 1000' % labelMatchers, + '{{quantile}}th percentile', + ) + ) + ) + .addRow( + g.row('Queues') /* Queue lengths are levels, not counters, so they are plotted without rate(); the utilization overview dashboard queries the same two metrics raw as well. NOTE(review): confirm the metric type exposed by the deployed gubernator version. */ + .addPanel( + panel('getRateLimitsBatch queue length', 'The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued for batching to a remote peer.', '') + + g.queryPanel( + 'sum by(job) (gubernator_queue_length{%(nsAndJob)s})' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( + panel('GetRateLimit queue length', 'The number of GetRateLimit requests queued up in GubernatorPool workers.', '') + + g.queryPanel( + 'sum by(job) (gubernator_pool_queue_length{%(nsAndJob)s})' % labelMatchers, + '{{job}}', + ) + ) + ) + .addRow( + g.row('Cache') + .addPanel( + panel('Requests', 'Rate of cache requests per second', 'reqps') + + g.queryPanel( + 'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( /* the query divides the miss rate by the total access rate, i.e. it is a 0-1 ratio, hence the percentunit axis */ + panel('Misses', 'Ratio of cache misses to total cache accesses', 'percentunit') + + g.queryPanel( + 'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s, type="miss"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( + panel('Size', 'The number of items in LRU Cache which holds 
the rate limits.', '') + + g.queryPanel( + 'sum by(job) (gubernator_cache_size{%(nsAndJob)s})' % labelMatchers, + '{{job}}', + ) + ) + .addPanel( + panel('Unexpired evictions', 'Rate of cache items which were evicted while unexpired per second.', 'reqps') + + g.queryPanel( + 'sum by(job) (rate(gubernator_unexpired_evictions_count{%(nsAndJob)s}[$interval]))' % labelMatchers, + '{{job}}', + ) + ) + ) + .addRow( + g.row('Other latencies') + .addPanel( + panel('Batch', 'Latency of batch send operations to a remote peer per percentiles', 'ms') + + g.queryPanel( + 'avg by(quantile, job) (gubernator_batch_send_duration{%(nsAndJob)s}) * 1000' % labelMatchers, + '{{quantile}}th percentile', + ) + ) + .addPanel( + panel('Broadcast', 'Latency of GLOBAL broadcasts to peers per percentiles', 'ms') + + g.queryPanel( + 'avg by(quantile, job) (gubernator_broadcast_durations{%(nsAndJob)s}) * 1000' % labelMatchers, + '{{quantile}}th percentile', + ) + ) + .addPanel( + panel('Async', 'Latency of GLOBAL async sends per percentiles', 'ms') + + g.queryPanel( + 'avg by(quantile, job) (gubernator_async_durations{%(nsAndJob)s}) * 1000' % labelMatchers, + '{{quantile}}th percentile', + ) + ) + ) + .addRow( + g.row('Resources usage') + .addPanel( + panel('Memory Usage', 'Memory usage of the Gubernator process', 'MiB') + + g.queryPanel( + 'container_memory_working_set_bytes{%(container)s, %(pod)s, %(ns)s} / 1024^2' % labelMatchers, + 'memory usage system {{pod}}', + ) + ) + .addPanel( + panel('CPU Usage', 'CPU usage of the Gubernator process', 'percent') + + g.queryPanel( + 'rate(container_cpu_usage_seconds_total{%(container)s, %(pod)s, %(ns)s}[$interval]) * 100' % labelMatchers, + 'cpu usage system {{pod}}', + ) + ) + .addPanel( + panel('Pod/Container Restarts', 'Number of times the pod/container has restarted', '') + + g.queryPanel( + 'sum by (pod) (kube_pod_container_status_restarts_total{%(container)s, %(pod)s, %(ns)s})' % labelMatchers, + 'pod restart count {{pod}}', + ) + ) +
.addPanel( + panel('Network Usage', 'Network usage of the Gubernator process', 'binBps') + + g.queryPanel( + [ + 'sum by (pod) (rate(container_network_receive_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers, + 'sum by (pod) (rate(container_network_transmit_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers, + ], + [ + 'network traffic in {{pod}}', + 'network traffic out {{pod}}', + ] + ) + ) + ) + { /* Override the inherited template variables: attach datasourcesRegex to the 'datasource' variable and append the $interval variable. */ + templating+: { + list: [ + if variable.name == 'datasource' + then variable { regex: datasourcesRegex } + else variable + for variable in super.list + ] + [intervalTemplate], + }, + }, + }, + apiVersion: 'v1', + kind: 'ConfigMap', + metadata: { + name: 'grafana-dashboard-obervatorium-gubernator', /* NOTE(review): 'obervatorium' here and in the data key below looks like a typo for 'observatorium'; left unchanged because these names may be referenced outside this file. Confirm, then rename both together. */ + }, + data: { + 'rhobs-instance-obervatorium-gubernator.json': std.manifestJsonEx($.dashboard.data, ' '), + }, +} diff --git a/observability/dashboards/rhobs-instance-utilization-overview.libsonnet b/observability/dashboards/rhobs-instance-utilization-overview.libsonnet index 2792b9391a..017505f1ba 100644 --- a/observability/dashboards/rhobs-instance-utilization-overview.libsonnet +++ b/observability/dashboards/rhobs-instance-utilization-overview.libsonnet @@ -32,7 +32,7 @@ function() { template.new( name='job', datasource='$datasource', - query='label_values(up{namespace="$namespace", job=~"observatorium-thanos-.*|observatorium-ruler-query.*"}, job)', + query='label_values(up{namespace="$namespace", job=~"observatorium-thanos-.*|observatorium-ruler-query.*|observatorium-gubernator"}, job)', label='job', allValues='.+', current='', @@ -618,6 +618,7 @@ function() { g.row('Gubernator Overview') .addPanel( g.panel('Rate of gRPC requests', 'Shows count of gRPC requests to gubernator') + + g.addDashboardLink(thanos.gubernator.dashboard.title) + g.queryPanel( [ 'sum(rate(gubernator_grpc_request_counts{namespace="$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)', @@ -630,6 +631,7 @@ function() { ) .addPanel( g.panel('Rate of errors in gRPC 
requests', 'Shows count of errors in gRPC requests to gubernator') { span:: 0 } + + g.addDashboardLink(thanos.gubernator.dashboard.title) + g.queryPanel( [ 'sum(rate(gubernator_grpc_request_counts{status="failed",namespace="$namespace",job=~"$job"}[$__rate_interval])) by (namespace,job,pod)', @@ -642,6 +644,7 @@ function() { ) .addPanel( g.panel('Duration of gRPC requests', 'Shows duration of gRPC requests to gubernator') + + g.addDashboardLink(thanos.gubernator.dashboard.title) + g.queryPanel( [ 'gubernator_grpc_request_duration{quantile="0.99", namespace="$namespace",job=~"$job"}', @@ -656,6 +659,7 @@ function() { ) .addPanel( g.panel('Local queue of rate checks', 'Shows the number of rate checks in the local queue') + + g.addDashboardLink(thanos.gubernator.dashboard.title) + g.queryPanel( [ 'gubernator_pool_queue_length{namespace="$namespace",job=~"$job"}', @@ -667,6 +671,7 @@ function() { ) .addPanel( g.panel('Peer queue of rate checks', 'Shows the number of rate checks in the peer queue') + + g.addDashboardLink(thanos.gubernator.dashboard.title) + g.queryPanel( [ 'gubernator_queue_length{namespace="$namespace",job=~"$job"}', @@ -677,17 +682,21 @@ function() { ) { span:: 0 } ) .addPanel( + g.addDashboardLink(thanos.gubernator.dashboard.title) + memoryUsagePanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod) + { yaxes: g.yaxes('bytes') } + g.stack ) .addPanel( + g.addDashboardLink(thanos.gubernator.dashboard.title) + cpuUsagePanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod) ) .addPanel( + g.addDashboardLink(thanos.gubernator.dashboard.title) + podRestartPanel(thanos.gubernator.dashboard.container, thanos.gubernator.dashboard.pod) ) .addPanel( + g.addDashboardLink(thanos.gubernator.dashboard.title) + networkUsagePanel(thanos.gubernator.dashboard.pod) + g.stack + { yaxes: g.yaxes('binBps') } diff --git a/observability/grafana.jsonnet b/observability/grafana.jsonnet index cce320fa21..d9e1e2797b 100644 --- 
a/observability/grafana.jsonnet +++ b/observability/grafana.jsonnet @@ -75,7 +75,8 @@ local dashboards = { 'grafana-dashboard-tracing-otel.configmap': (import 'dashboards/opentelemetry.libsonnet')(obsDatasource, obsTraces) } + { 'grafana-dashboard-tracing-jaeger.configmap': (import 'dashboards/tracing.libsonnet')(obsDatasource, obsTraces) } + { 'grafana-dashboard-rhobs-instance-utilization-overview.configmap': (import 'dashboards/rhobs-instance-utilization-overview.libsonnet')() } + - { 'grafana-dashboard-rules-objstore.configmap': (import 'dashboards/rules-objstore.libsonnet')() }; + { 'grafana-dashboard-rules-objstore.configmap': (import 'dashboards/rules-objstore.libsonnet')() } + + { 'grafana-dashboard-observatorium-gubernator.configmap': (import 'dashboards/observatorium-gubernator.libsonnet')() }; { [name]: dashboards[name] { metadata+: { diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml new file mode 100644 index 0000000000..5d64083e2d --- /dev/null +++ b/resources/observability/grafana/observatorium/grafana-dashboard-observatorium-gubernator.configmap.yaml @@ -0,0 +1,1967 @@ +apiVersion: v1 +data: + rhobs-instance-obervatorium-gubernator.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of gRPC requests to the API per second", + "fill": 1, + "fillGradient": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": 
[ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of failed gRPC requests to the API per second", + "fill": 1, + "fillGradient": 0, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\", status=\"failed\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "{{status}} {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latency of gRPC requests to the API per percentiles", + "fill": 1, + "fillGradient": 0, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg by(quantile, job) (gubernator_grpc_request_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetRateLimits\"}) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}th percentile", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Latencies", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + 
}, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of requests that resulted in rate limiting (over the limit) per second", + "fill": 1, + "fillGradient": 0, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (rate(gubernator_over_limit_counter{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Over Limit requests rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "GetRateLimits API", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of gRPC requests to the API 
per second", + "fill": 1, + "fillGradient": 0, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of failed gRPC requests to the API per second", + "fill": 1, + "fillGradient": 0, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + 
"expr": "sum by (job, method) (rate(gubernator_grpc_request_counts{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\", status=\"failed\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{status}} {{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Errors", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latency of gRPC requests to the API per percentiles", + "fill": 1, + "fillGradient": 0, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg by(quantile, job) (gubernator_grpc_request_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\", method=~\".*/GetPeerRateLimits\"}) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}th percentile", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Latencies", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "GetPeerRateLimits API", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued by for batching to a remote peer.", + "fill": 1, + "fillGradient": 0, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (rate(gubernator_queue_length{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "getRateLimitsBatch queue length", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "The number of GetRateLimit requests queued up in GubernatorPool workers.", + "fill": 1, + "fillGradient": 0, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (rate(gubernator_pool_queue_length{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "GetRateLimit queue length", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Queues", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of cache requests per second", + 
"fill": 1, + "fillGradient": 0, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of cache misses per second", + "fill": 1, + "fillGradient": 0, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) 
(rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\", type=\"miss\"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Misses", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "The number of items in LRU Cache which holds the rate limits.", + "fill": 1, + "fillGradient": 0, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (gubernator_cache_size{namespace=\"$namespace\", job=\"observatorium-gubernator\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Size", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", 
+ "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Rate of cache items which were evicted while unexpired per second.", + "fill": 1, + "fillGradient": 0, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(job) (rate(gubernator_unexpired_evictions_count{namespace=\"$namespace\", job=\"observatorium-gubernator\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Unexpired evictions", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Cache", + "titleSize": "h6" + }, + { + "collapse": false, + 
"height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latency of batch send operations to a remote peer per percentiles", + "fill": 1, + "fillGradient": 0, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg by(quantile, job) (gubernator_batch_send_duration{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}th percentile", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Batch", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latency of of GLOBAL broadcasts to peers per percentiles", + "fill": 1, + "fillGradient": 0, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + 
"percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg by(quantile, job) (gubernator_broadcast_durations{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}th percentile", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Broadcast", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Latency of of GLOBAL async sends per percentiles", + "fill": 1, + "fillGradient": 0, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "avg by(quantile, job) (gubernator_async_durations{namespace=\"$namespace\", job=\"observatorium-gubernator\"}) * 1000", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{quantile}}th percentile", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + 
"timeShift": null, + "title": "Async", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Other latencies", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Memory usage of the Gubernator process", + "fill": 1, + "fillGradient": 0, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "container_memory_working_set_bytes{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"} / 1024^2", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory usage system {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "MiB", + "label": null, + "logBase": 
1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "CPU usage of the Gubernator process", + "fill": 1, + "fillGradient": 0, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(container_cpu_usage_seconds_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "cpu usage system {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Number of times the pod/container has restarted", + "fill": 1, + "fillGradient": 0, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (pod) (kube_pod_container_status_restarts_total{container=\"gubernator\", pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "pod restart count {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Pod/Container Restarts", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "Network usage of the Gubernator process", + "fill": 1, + "fillGradient": 0, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (pod) (rate(container_network_receive_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", + "format": "time_series", 
+ "intervalFactor": 2, + "legendFormat": "network traffic in {{pod}}", + "legendLink": null, + "step": 10 + }, + { + "expr": "sum by (pod) (rate(container_network_transmit_bytes_total{pod=~\"observatorium-gubernator.*\", namespace=\"$namespace\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "network traffic out {{pod}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Resources usage", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(gubernator_check_counter, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "auto": true, + "auto_count": 300, + "auto_min": "10s", + "current": 
{ + "text": "5m", + "value": "5m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "query": "5m,10m,30m,1h,6h,12h", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Observatorium - Gubernator", + "uid": "", + "version": 0 + } +kind: ConfigMap +metadata: + annotations: + grafana-folder: /grafana-dashboard-definitions/Observatorium + labels: + grafana_dashboard: "true" + name: grafana-dashboard-obervatorium-gubernator diff --git a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml index dd29d2697d..bdd7526ca7 100644 --- a/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml +++ b/resources/observability/grafana/observatorium/grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml @@ -6944,7 +6944,13 @@ data: "lines": true, "linewidth": 0, "links": [ - + { + "dashboard": "Observatorium - Gubernator", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - Gubernator", + "type": "dashboard" + } ], "nullPointMode": "null as zero", "percentage": false, @@ -7030,7 +7036,13 @@ data: "lines": true, "linewidth": 0, "links": [ - + { + "dashboard": "Observatorium - Gubernator", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - Gubernator", + "type": "dashboard" + } ], "nullPointMode": "null as zero", "percentage": false, @@ -7116,7 +7128,13 @@ data: "lines": true, "linewidth": 1, "links": [ - + { + "dashboard": "Observatorium - Gubernator", + "includeVars": 
true, + "keepTime": true, + "title": "Observatorium - Gubernator", + "type": "dashboard" + } ], "nullPointMode": "null as zero", "percentage": false, @@ -7210,7 +7228,13 @@ data: "lines": true, "linewidth": 1, "links": [ - + { + "dashboard": "Observatorium - Gubernator", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - Gubernator", + "type": "dashboard" + } ], "nullPointMode": "null as zero", "percentage": false, @@ -7296,7 +7320,13 @@ data: "lines": true, "linewidth": 1, "links": [ - + { + "dashboard": "Observatorium - Gubernator", + "includeVars": true, + "keepTime": true, + "title": "Observatorium - Gubernator", + "type": "dashboard" + } ], "nullPointMode": "null as zero", "percentage": false, @@ -8283,7 +8313,7 @@ data: "options": [ ], - "query": "label_values(up{namespace=\"$namespace\", job=~\"observatorium-thanos-.*|observatorium-ruler-query.*\"}, job)", + "query": "label_values(up{namespace=\"$namespace\", job=~\"observatorium-thanos-.*|observatorium-ruler-query.*|observatorium-gubernator\"}, job)", "refresh": 2, "regex": "", "sort": 1,