diff --git a/charts/victoria-logs-single/files/rules/generated/vlogs.yaml b/charts/victoria-logs-single/files/rules/generated/vlogs.yaml
index d09a6ab71..6629948d1 100644
--- a/charts/victoria-logs-single/files/rules/generated/vlogs.yaml
+++ b/charts/victoria-logs-single/files/rules/generated/vlogs.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
diff --git a/charts/victoria-metrics-k8s-stack/CHANGELOG.md b/charts/victoria-metrics-k8s-stack/CHANGELOG.md
index e3f20248f..9b16e4e69 100644
--- a/charts/victoria-metrics-k8s-stack/CHANGELOG.md
+++ b/charts/victoria-metrics-k8s-stack/CHANGELOG.md
@@ -1,6 +1,9 @@
## Next release
-- TODO
+**Update note**: This release contains a breaking change. `.Values.externalVM` was renamed to `.Values.external.vm`.
+
+- add `.Values.external.grafana.host` to configure the Grafana host for alerts when `.Values.grafana.enabled: false`
+- rename `.Values.externalVM` to `.Values.external.vm` for consistency
## 0.34.0
diff --git a/charts/victoria-metrics-k8s-stack/README.md b/charts/victoria-metrics-k8s-stack/README.md
index b1fd9990c..83f706050 100644
--- a/charts/victoria-metrics-k8s-stack/README.md
+++ b/charts/victoria-metrics-k8s-stack/README.md
@@ -845,7 +845,7 @@ perReplica: false
type: prometheus
- isDefault: false
name: VictoriaMetrics (DS)
- type: victoriametrics-datasource
+ type: victoriametrics-metrics-datasource
@@ -1326,7 +1326,18 @@ vmsingle:
- externalVM |
+ external.grafana |
+ object |
+
+host: grafana.external.host
+
+
+ |
+ External Grafana host
+ |
+
+
+ external.vm |
object |
read:
@@ -2475,7 +2486,7 @@ unauthorizedUserAccessSpec:
|
- Full spec for VMAuth CRD. Allowed values described here It’s possible to use given below predefined variables in spec: * {{ .vm.read }} - parsed vmselect, vmsingle or externalVM.read URL * {{ .vm.write }} - parsed vminsert, vmsingle or externalVM.write URL
+ | Full spec for VMAuth CRD. Allowed values described here It’s possible to use given below predefined variables in spec: * {{ .vm.read }} - parsed vmselect, vmsingle or external.vm.read URL * {{ .vm.write }} - parsed vminsert, vmsingle or external.vm.write URL
|
diff --git a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/controller-manager.yaml b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/controller-manager.yaml
index 52485b5e6..65a170c86 100644
--- a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/controller-manager.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/controller-manager.yaml
@@ -232,8 +232,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
@@ -268,8 +268,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
diff --git a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/kubelet.yaml b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/kubelet.yaml
index 0f279b482..6364d9162 100644
--- a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/kubelet.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/kubelet.yaml
@@ -722,8 +722,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster",job="kubelet", metrics_path="/metrics", instance=~"$instance"}[$__rate_interval])) by (instance, verb, url, le))
- legendFormat: '{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster",job="kubelet", metrics_path="/metrics", instance=~"$instance"}[$__rate_interval])) by (instance, verb, le))
+ legendFormat: '{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}}'
title: Request duration 99th quantile
type: timeseries
- datasource:
diff --git a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/proxy.yaml b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/proxy.yaml
index 0e9539716..d6dc8fae7 100644
--- a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/proxy.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/proxy.yaml
@@ -268,8 +268,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy",instance=~"$instance",verb="POST"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy",instance=~"$instance",verb="POST"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
@@ -304,8 +304,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
diff --git a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/scheduler.yaml b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/scheduler.yaml
index 594dc77fd..f3eb26f18 100644
--- a/charts/victoria-metrics-k8s-stack/files/dashboards/generated/scheduler.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/dashboards/generated/scheduler.yaml
@@ -226,8 +226,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
@@ -262,8 +262,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
- expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
- legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
+ expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
+ legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
index 1f1d50856..fb0e35823 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.alertmanager).enabled }}'
name: alertmanager.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml
index eb52d9fa5..b967a3e9a 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeEtcd).enabled }}'
name: etcd
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
index 3f365bce7..9287bee45 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: general.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_limits.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_limits.yaml
index c6448b39c..0d86579bd 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_limits.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_limits.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_limits
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_requests.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_requests.yaml
index be8c36d4b..ad6b82c51 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_requests.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_requests.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_requests
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml
index 81efacb10..e25c26e0c 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_usage_seconds_total
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml
index 281048576..63deb6465 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_cache
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_limits.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_limits.yaml
index d803ca0d1..0819aaa36 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_limits.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_limits.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_limits
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_requests.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_requests.yaml
index 3f56c7cda..bc2ef3866 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_requests.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_requests.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_requests
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml
index a80342fad..da04872e6 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_rss
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml
index 6aa8a3a93..93797ab90 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_swap
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml
index 6037850ff..9f5f377bc 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_working_set_bytes
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml
index f17ebfa3e..dcbf6b16b 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.pod_owner
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml
index 120ff7ef0..35485dfbc 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeApiServer).enabled }}'
interval: 3m
name: kube-apiserver-availability.rules
@@ -38,7 +38,7 @@ rules:
# write too slow
sum by ({{ $clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"})
) +
(
# read too slow
@@ -46,14 +46,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"})
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"})
+
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"})
)
) +
# errors
@@ -72,14 +72,14 @@ rules:
(
# too slow
(
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"})
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"})
+
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"})
)
+
# errors
@@ -97,7 +97,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
- sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
+ sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"})
)
+
# errors
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml
index 8c5b736fa..5d8725d3d 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeApiServer).enabled }}'
name: kube-apiserver-burnrate.rules
rules:
@@ -14,14 +14,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1d]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1d]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1d]))
)
)
+
@@ -42,14 +42,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1h]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1h]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1h]))
)
)
+
@@ -70,14 +70,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[2h]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[2h]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[2h]))
)
)
+
@@ -98,14 +98,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[30m]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[30m]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[30m]))
)
)
+
@@ -126,14 +126,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[3d]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[3d]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[3d]))
)
)
+
@@ -154,14 +154,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[5m]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[5m]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[5m]))
)
)
+
@@ -182,14 +182,14 @@ rules:
-
(
(
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[6h]))
or
vector(0)
)
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[6h]))
+
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[6h]))
)
)
+
@@ -208,7 +208,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1d]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
@@ -225,7 +225,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1h]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
@@ -242,7 +242,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[2h]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
@@ -259,7 +259,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[30m]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
@@ -276,7 +276,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[3d]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
@@ -293,7 +293,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[5m]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
@@ -310,7 +310,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
- sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
+ sum by ({{ $clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[6h]))
)
+
sum by ({{ $clusterLabel }}) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml
index 97be8e628..bb54c6b65 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeApiServer).enabled }}'
name: kube-apiserver-histogram.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml
index 7fb40f7d7..e347939e8 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml
@@ -1,13 +1,13 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeApiServer).enabled }}'
name: kube-apiserver-slos
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
- description: 'The API server is burning too much error budget.'
+ description: 'The API server is burning too much error budget on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
@@ -22,7 +22,7 @@ rules:
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
- description: 'The API server is burning too much error budget.'
+ description: 'The API server is burning too much error budget on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
@@ -37,7 +37,7 @@ rules:
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
- description: 'The API server is burning too much error budget.'
+ description: 'The API server is burning too much error budget on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
@@ -52,7 +52,7 @@ rules:
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
- description: 'The API server is burning too much error budget.'
+ description: 'The API server is burning too much error budget on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml
index d5da7aea6..e539d89ca 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kube-prometheus-general.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml
index ac06e16f2..fd373379b 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kube-prometheus-node-recording.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml
index 6d7f7109d..d9d98d2c9 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeScheduler).enabled }}'
name: kube-scheduler.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml
index 6f480d98f..63c395202 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kube-state-metrics
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml
index cc3694128..e35f37705 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubelet).enabled }}'
name: kubelet.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml
index 7a7727863..14eece60d 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml
@@ -1,13 +1,13 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
- description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
+ description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodcrashlooping'
summary: 'Pod is crash looping.'
condition: '{{ true }}'
@@ -17,7 +17,7 @@ rules:
severity: warning
- alert: KubePodNotReady
annotations:
- description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.'
+ description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubepodnotready'
summary: 'Pod has been in a non-ready state for more than 15 minutes.'
condition: '{{ true }}'
@@ -34,7 +34,7 @@ rules:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
- description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.'
+ description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
summary: 'Deployment generation mismatch due to possible roll-back'
condition: '{{ true }}'
@@ -47,7 +47,7 @@ rules:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
- description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
+ description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
summary: 'Deployment has not matched the expected number of replicas.'
condition: '{{ true }}'
@@ -66,7 +66,7 @@ rules:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
- description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.'
+ description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
summary: 'Deployment rollout is not progressing.'
condition: '{{ true }}'
@@ -78,7 +78,7 @@ rules:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
- description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
+ description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
summary: 'StatefulSet has not matched the expected number of replicas.'
condition: '{{ true }}'
@@ -97,7 +97,7 @@ rules:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
- description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.'
+ description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
summary: 'StatefulSet generation mismatch due to possible roll-back'
condition: '{{ true }}'
@@ -110,7 +110,7 @@ rules:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
- description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.'
+ description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
summary: 'StatefulSet update has not been rolled out.'
condition: '{{ true }}'
@@ -137,7 +137,7 @@ rules:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
- description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m.'
+ description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15m on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
summary: 'DaemonSet rollout is stuck.'
condition: '{{ true }}'
@@ -170,7 +170,7 @@ rules:
severity: warning
- alert: KubeContainerWaiting
annotations:
- description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}").'
+ description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}") on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubecontainerwaiting'
summary: 'Pod container waiting longer than 1 hour'
condition: '{{ true }}'
@@ -180,7 +180,7 @@ rules:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
- description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
+ description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
summary: 'DaemonSet pods are not scheduled.'
condition: '{{ true }}'
@@ -193,7 +193,7 @@ rules:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
- description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
+ description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
summary: 'DaemonSet pods are misscheduled.'
condition: '{{ true }}'
@@ -203,7 +203,7 @@ rules:
severity: warning
- alert: KubeJobNotCompleted
annotations:
- description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.'
+ description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobnotcompleted'
summary: 'Job did not complete in time'
condition: '{{ true }}'
@@ -215,7 +215,7 @@ rules:
severity: warning
- alert: KubeJobFailed
annotations:
- description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.'
+ description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubejobfailed'
summary: 'Job failed to complete.'
condition: '{{ true }}'
@@ -225,7 +225,7 @@ rules:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
- description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.'
+ description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpareplicasmismatch'
summary: 'HPA has not matched desired number of replicas.'
condition: '{{ true }}'
@@ -248,7 +248,7 @@ rules:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
- description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.'
+ description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubehpamaxedout'
summary: 'HPA is running at max replicas'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml
index e7e4352be..196f31ccb 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kubernetes-resources
rules:
@@ -61,7 +61,7 @@ rules:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
- description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
+ description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaalmostfull'
summary: 'Namespace quota is going to be full.'
condition: '{{ true }}'
@@ -75,7 +75,7 @@ rules:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
- description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
+ description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotafullyused'
summary: 'Namespace quota is fully used.'
condition: '{{ true }}'
@@ -89,7 +89,7 @@ rules:
severity: info
- alert: KubeQuotaExceeded
annotations:
- description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
+ description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubequotaexceeded'
summary: 'Namespace quota has exceeded the limits.'
condition: '{{ true }}'
@@ -103,7 +103,7 @@ rules:
severity: warning
- alert: CPUThrottlingHigh
annotations:
- description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
+ description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/cputhrottlinghigh'
summary: 'Processes experience elevated CPU throttling.'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml
index b74b40f1a..90cfca2ce 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kubernetes-storage
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml
index 4c8ec8bbb..52fa7f877 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: kubernetes-system-apiserver
rules:
@@ -43,7 +43,7 @@ rules:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
- description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.'
+ description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeaggregatedapidown'
summary: 'Kubernetes aggregated API is down.'
condition: '{{ true }}'
@@ -63,7 +63,7 @@ rules:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
- description: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
+ description: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeapiterminatedrequests'
summary: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml
index 19fa19786..9d22d2f57 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ ($Values.kubeControllerManager).enabled }}'
name: kubernetes-system-controller-manager
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml
index 45abe55a9..16beab649 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml
@@ -1,13 +1,13 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: kubernetes-system-kubelet
rules:
- alert: KubeNodeNotReady
annotations:
- description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
+ description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubenodenotready'
summary: 'Node is not ready.'
condition: '{{ true }}'
@@ -17,7 +17,7 @@ rules:
severity: warning
- alert: KubeNodeUnreachable
annotations:
- description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
+ description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubenodeunreachable'
summary: 'Node is unreachable.'
condition: '{{ true }}'
@@ -27,24 +27,28 @@ rules:
severity: warning
- alert: KubeletTooManyPods
annotations:
- description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.'
+ description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubelettoomanypods'
summary: 'Kubelet is running at capacity.'
condition: '{{ true }}'
expr: |-
count by (node,{{ $clusterLabel }}) (
- (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,{{ $clusterLabel }}) group_left(node) topk by (instance,pod,namespace,{{ $clusterLabel }}) (1, kube_pod_info{job="kube-state-metrics"})
+ (kube_pod_status_phase{job="kube-state-metrics", phase="Running"} == 1)
+ * on (namespace,pod,{{ $clusterLabel }}) group_left (node)
+ group by (namespace,pod,node,{{ $clusterLabel }}) (
+ kube_pod_info{job="kube-state-metrics"}
+ )
)
/
max by (node,{{ $clusterLabel }}) (
- kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
+ kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: info
- alert: KubeNodeReadinessFlapping
annotations:
- description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.'
+ description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubenodereadinessflapping'
summary: 'Node readiness status is flapping.'
condition: '{{ true }}'
@@ -54,7 +58,7 @@ rules:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
- description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
+ description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletplegdurationhigh'
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.'
condition: '{{ true }}'
@@ -64,7 +68,7 @@ rules:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
- description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
+ description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh'
summary: 'Kubelet Pod startup latency is too high.'
condition: '{{ true }}'
@@ -74,7 +78,7 @@ rules:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
- description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
+ description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
@@ -83,7 +87,7 @@ rules:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
- description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
+ description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
@@ -92,7 +96,7 @@ rules:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
- description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
+ description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
@@ -101,7 +105,7 @@ rules:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
- description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
+ description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}} on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
@@ -110,7 +114,7 @@ rules:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
- description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
+ description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes) on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors'
summary: 'Kubelet has failed to renew its client certificate.'
condition: '{{ true }}'
@@ -120,7 +124,7 @@ rules:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
- description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
+ description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes) on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors'
summary: 'Kubelet has failed to renew its server certificate.'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml
index af34e25a1..9d8fc6631 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ ($Values.kubeScheduler).enabled }}'
name: kubernetes-system-scheduler
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml
index c0e667d4e..5ecd6ea82 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml
@@ -1,13 +1,13 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: kubernetes-system
rules:
- alert: KubeVersionMismatch
annotations:
- description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.'
+ description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeversionmismatch'
summary: 'Different semantic versions of Kubernetes components running.'
condition: '{{ true }}'
@@ -17,7 +17,7 @@ rules:
severity: warning
- alert: KubeClientErrors
annotations:
- description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'''
+ description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors on cluster {{`{{`}} $labels.{{ $clusterLabel }} {{`}}`}}.'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeclienterrors'
summary: 'Kubernetes API server client is experiencing errors.'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml
index 1baf9e203..9271b07c6 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: node-exporter.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml
index 6b6ea39ff..56b76e040 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: node-exporter
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml
index 21c51e64c..6d7208d11 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: node-network
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml
index 4e5bdd89d..354db8ca5 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: node.rules
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml
index 8defcccf4..32169543d 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/vm-health.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
condition: '{{ true }}'
name: vm-health
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmagent.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmagent.yaml
index 08b58ca2e..c1ad494fd 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmagent.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmagent.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
@@ -9,7 +9,7 @@ name: vmagent
rules:
- alert: PersistentQueueIsDroppingData
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Vmagent dropped {{`{{`}} $value | humanize1024 {{`}}`}} from persistent queue on instance {{`{{`}} $labels.instance {{`}}`}} for the last 10m.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} is dropping data from persistent queue'
condition: '{{ true }}'
@@ -19,7 +19,7 @@ rules:
severity: critical
- alert: RejectedRemoteWriteDataBlocksAreDropped
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects.'
summary: 'Vmagent is dropping data blocks that are rejected by remote storage'
condition: '{{ true }}'
@@ -29,7 +29,7 @@ rules:
severity: warning
- alert: TooManyScrapeErrors
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} fails to scrape targets for last 15m'
summary: 'Vmagent fails to scrape one or more targets'
condition: '{{ true }}'
@@ -39,7 +39,7 @@ rules:
severity: warning
- alert: TooManyWriteErrors
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} responds with errors to write requests for last 15m.'
summary: 'Vmagent responds with too many errors on data ingestion protocols'
condition: '{{ true }}'
@@ -52,7 +52,7 @@ rules:
severity: warning
- alert: TooManyRemoteWriteErrors
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Vmagent fails to push data via remote write protocol to destination \"{{`{{`}} $labels.url {{`}}`}}\"\n Ensure that destination is up and reachable."
summary: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} fails to push to remote storage'
condition: '{{ true }}'
@@ -62,7 +62,7 @@ rules:
severity: warning
- alert: RemoteWriteConnectionIsSaturated
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "The remote write connection between vmagent \"{{`{{`}} $labels.job {{`}}`}}\" (instance {{`{{`}} $labels.instance {{`}}`}}) and destination \"{{`{{`}} $labels.url {{`}}`}}\" is saturated by more than 90% and vmagent won't be able to keep up.\n This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage."
summary: 'Remote write connection from "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) to {{`{{`}} $labels.url {{`}}`}} is saturated'
condition: '{{ true }}'
@@ -72,7 +72,7 @@ rules:
severity: warning
- alert: PersistentQueueForWritesIsSaturated
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Persistent queue writes for vmagent "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) are saturated by more than 90% and vmagent won''t be able to keep up with flushing data on disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.'
summary: 'Persistent queue writes for instance {{`{{`}} $labels.instance {{`}}`}} are saturated'
condition: '{{ true }}'
@@ -82,7 +82,7 @@ rules:
severity: warning
- alert: PersistentQueueForReadsIsSaturated
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Persistent queue reads for vmagent "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) are saturated by more than 90% and vmagent won''t be able to keep up with reading data from the disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.'
summary: 'Persistent queue reads for instance {{`{{`}} $labels.instance {{`}}`}} are saturated'
condition: '{{ true }}'
@@ -92,7 +92,7 @@ rules:
severity: warning
- alert: SeriesLimitHourReached
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} reached 90% of the limit'
condition: '{{ true }}'
@@ -101,7 +101,7 @@ rules:
severity: critical
- alert: SeriesLimitDayReached
annotations:
- dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} reached 90% of the limit'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmcluster.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmcluster.yaml
index 8d5cca128..c4ee9b7b0 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmcluster.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmcluster.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- /* Dashboard host: bundled Grafana ingress host when grafana.enabled, otherwise external.grafana.host. ternary evaluates both branches, so unset hosts/enabled are defaulted to keep rendering from failing. */ -}}{{- $grafanaHost := ternary (index ((($Values.grafana).ingress).hosts | default (list "")) 0) (($Values.external).grafana).host (($Values.grafana).enabled | default false) }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
@@ -9,7 +9,7 @@ name: vmcluster
rules:
- alert: DiskRunsOutOfSpaceIn3Days
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Taking into account current ingestion rate, free disk space will be enough only for {{`{{`}} $value | humanizeDuration {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} will run out of disk space in 3 days'
condition: '{{ true }}'
@@ -26,7 +26,7 @@ rules:
severity: critical
- alert: NodeBecomesReadonlyIn3Days
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Taking into account current ingestion rate, free disk space and -storage.minFreeDiskSpaceBytes instance {{`{{`}} $labels.instance {{`}}`}} will remain writable for {{`{{`}} $value | humanizeDuration {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} will become read-only in 3 days'
condition: '{{ true }}'
@@ -43,7 +43,7 @@ rules:
severity: warning
- alert: DiskRunsOutOfSpace
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=200&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=200&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Disk utilisation on instance {{`{{`}} $labels.instance {{`}}`}} is more than 80%.\n Having less than 20% of free disk space could cripple merges processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} (job={{`{{`}} $labels.job {{`}}`}}) will run out of disk space soon'
condition: '{{ true }}'
@@ -58,7 +58,7 @@ rules:
severity: critical
- alert: RequestErrorsToAPI
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=52&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=52&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Requests to path {{`{{`}} $labels.path {{`}}`}} are receiving errors. Please verify if clients are sending correct requests.'
summary: 'Too many errors served for {{`{{`}} $labels.job {{`}}`}} path {{`{{`}} $labels.path {{`}}`}} (instance {{`{{`}} $labels.instance {{`}}`}})'
condition: '{{ true }}'
@@ -69,7 +69,7 @@ rules:
show_at: dashboard
- alert: RPCErrors
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=44&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=44&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "RPC errors are interconnection errors between cluster components.\n Possible reasons for errors are misconfiguration, overload, network blips or unreachable components."
summary: 'Too many RPC errors for {{`{{`}} $labels.job {{`}}`}} (instance {{`{{`}} $labels.instance {{`}}`}})'
condition: '{{ true }}'
@@ -87,7 +87,7 @@ rules:
show_at: dashboard
- alert: TooHighChurnRate
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=102'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=102'
description: "VM constantly creates new time series.\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries."
summary: 'Churn rate is more than 10% for the last 15m'
condition: '{{ true }}'
@@ -102,7 +102,7 @@ rules:
severity: warning
- alert: TooHighChurnRate24h
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=102'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=102'
description: "The number of created new time series over last 24h is 3x times higher than current number of active series.\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries."
summary: 'Too high number of new series created over last 24h'
condition: '{{ true }}'
@@ -115,7 +115,7 @@ rules:
severity: warning
- alert: TooHighSlowInsertsRate
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=108'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=108'
description: 'High rate of slow inserts may be a sign of resource exhaustion for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series. See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183'
summary: 'Percentage of slow inserts is more than 5% for the last 15m'
condition: '{{ true }}'
@@ -130,7 +130,7 @@ rules:
severity: warning
- alert: VminsertVmstorageConnectionIsSaturated
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=139&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=139&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "The connection between vminsert (instance {{`{{`}} $labels.instance {{`}}`}}) and vmstorage (instance {{`{{`}} $labels.addr {{`}}`}}) is saturated by more than 90% and vminsert won't be able to keep up.\n This usually means that more vminsert or vmstorage nodes must be added to the cluster in order to increase the total number of vminsert -> vmstorage links."
summary: 'Connection between vminsert on {{`{{`}} $labels.instance {{`}}`}} and vmstorage on {{`{{`}} $labels.addr {{`}}`}} is saturated'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmoperator.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmoperator.yaml
index f7ee310de..62547e6c2 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmoperator.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmoperator.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: vmoperator
rules:
diff --git a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmsingle.yaml b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmsingle.yaml
index af41585fa..8108be292 100644
--- a/charts/victoria-metrics-k8s-stack/files/rules/generated/vmsingle.yaml
+++ b/charts/victoria-metrics-k8s-stack/files/rules/generated/vmsingle.yaml
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
-{{- $host := index (($Values.grafana).ingress).hosts 0 }}
+{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
@@ -9,7 +9,7 @@ name: vmsingle
rules:
- alert: DiskRunsOutOfSpaceIn3Days
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=73&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=73&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Taking into account current ingestion rate, free disk space will be enough only for {{`{{`}} $value | humanizeDuration {{`}}`}} on instance {{`{{`}} $labels.instance {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} will run out of disk space soon'
condition: '{{ true }}'
@@ -26,7 +26,7 @@ rules:
severity: critical
- alert: NodeBecomesReadonlyIn3Days
annotations:
- dashboard: '{{ $host }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/oS7Bi_0Wz?viewPanel=113&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Taking into account current ingestion rate and free disk space instance {{`{{`}} $labels.instance {{`}}`}} is writable for {{`{{`}} $value | humanizeDuration {{`}}`}}.\n Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} will become read-only in 3 days'
condition: '{{ true }}'
@@ -43,7 +43,7 @@ rules:
severity: warning
- alert: DiskRunsOutOfSpace
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=53&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=53&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Disk utilisation on instance {{`{{`}} $labels.instance {{`}}`}} is more than 80%.\n Having less than 20% of free disk space could cripple merge processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} (job={{`{{`}} $labels.job {{`}}`}}) will run out of disk space soon'
condition: '{{ true }}'
@@ -58,7 +58,7 @@ rules:
severity: critical
- alert: RequestErrorsToAPI
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=35&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=35&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Requests to path {{`{{`}} $labels.path {{`}}`}} are receiving errors. Please verify if clients are sending correct requests.'
summary: 'Too many errors served for path {{`{{`}} $labels.path {{`}}`}} (instance {{`{{`}} $labels.instance {{`}}`}})'
condition: '{{ true }}'
@@ -68,7 +68,7 @@ rules:
severity: warning
- alert: TooHighChurnRate
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=66&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=66&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "VM constantly creates new time series on \"{{`{{`}} $labels.instance {{`}}`}}\".\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries."
summary: 'Churn rate is more than 10% on "{{`{{`}} $labels.instance {{`}}`}}" for the last 15m'
condition: '{{ true }}'
@@ -83,7 +83,7 @@ rules:
severity: warning
- alert: TooHighChurnRate24h
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=66&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=66&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "The number of created new time series over last 24h is 3x times higher than current number of active series on \"{{`{{`}} $labels.instance {{`}}`}}\".\n This effect is known as Churn Rate.\n High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries."
summary: 'Too high number of new series on "{{`{{`}} $labels.instance {{`}}`}}" created over last 24h'
condition: '{{ true }}'
@@ -96,7 +96,7 @@ rules:
severity: warning
- alert: TooHighSlowInsertsRate
annotations:
- dashboard: '{{ $host }}/d/wNf0q_kZk?viewPanel=68&var-instance={{`{{`}} $labels.instance {{`}}`}}'
+ dashboard: '{{ $grafanaHost }}/d/wNf0q_kZk?viewPanel=68&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'High rate of slow inserts on "{{`{{`}} $labels.instance {{`}}`}}" may be a sign of resource exhaustion for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series. See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183'
summary: 'Percentage of slow inserts is more than 5% on "{{`{{`}} $labels.instance {{`}}`}}" for the last 15m'
condition: '{{ true }}'
diff --git a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
index bbb1e19e9..c8a891ebe 100644
--- a/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
+++ b/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl
@@ -14,8 +14,8 @@
{{- $baseURL := include "vm.url" . -}}
{{- $tenant := $Values.tenant | default 0 -}}
{{- $_ := set $endpoint "url" (printf "%s/select/%d/prometheus" $baseURL (int $tenant)) -}}
- {{- else if $Values.externalVM.read.url -}}
- {{- $endpoint = $Values.externalVM.read -}}
+ {{- else if $Values.external.vm.read.url -}}
+ {{- $endpoint = $Values.external.vm.read -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end }}
@@ -33,8 +33,8 @@
{{- $baseURL := include "vm.url" . -}}
{{- $tenant := $Values.tenant | default 0 -}}
{{- $_ := set $endpoint "url" (printf "%s/insert/%d/prometheus/api/v1/write" $baseURL (int $tenant)) -}}
- {{- else if $Values.externalVM.write.url -}}
- {{- $endpoint = $Values.externalVM.write -}}
+ {{- else if $Values.external.vm.write.url -}}
+ {{- $endpoint = $Values.external.vm.write -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end -}}
@@ -135,7 +135,7 @@
{{- define "vm.agent.remote.write" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $remoteWrites := $Values.vmagent.additionalRemoteWrites | default list -}}
- {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.externalVM.write.url -}}
+ {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.external.vm.write.url -}}
{{- $remoteWrites = append $remoteWrites (fromYaml (include "vm.write.endpoint" .)) -}}
{{- end -}}
{{- toYaml (dict "remoteWrite" $remoteWrites) -}}
@@ -169,12 +169,12 @@
{{- $readURL := urlParse (include "vm.url" .) -}}
{{- $_ := set $readURL "path" (printf "%s/select" $readURL.path) -}}
{{- $_ := set . "vm" (dict "read" $readURL "write" $writeURL) -}}
- {{- else if or $Values.externalVM.read.url $Values.externalVM.write.url -}}
+ {{- else if or $Values.external.vm.read.url $Values.external.vm.write.url -}}
{{- $_ := set . "vm" (default dict) -}}
- {{- with $Values.externalVM.read.url -}}
+ {{- with $Values.external.vm.read.url -}}
{{- $_ := set $.vm "read" (urlParse .) -}}
{{- end -}}
- {{- with $Values.externalVM.write.url -}}
+ {{- with $Values.external.vm.write.url -}}
{{- $_ := set $.vm "write" (urlParse .) -}}
{{- end -}}
{{- end -}}
@@ -265,7 +265,7 @@
{{- end }}
{{- end }}
{{- $unsignedPlugins := ((index $grafana "grafana.ini").plugins).allow_loading_unsigned_plugins | default "" -}}
- {{- $allowUnsigned := contains "victoriametrics-datasource" $unsignedPlugins -}}
+ {{- $allowUnsigned := contains "victoriametrics-metrics-datasource" $unsignedPlugins -}}
{{- ternary "true" "" (and $isEnabled $allowUnsigned) -}}
{{- else -}}
{{ "true" }}
@@ -277,7 +277,7 @@
{{- $ctx := . }}
{{- $Values := (.helm).Values | default .Values }}
{{- $datasources := $Values.defaultDatasources.extra | default list -}}
- {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.externalVM.read -}}
+  {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.external.vm.read.url -}}
{{- $readEndpoint:= include "vm.read.endpoint" $ctx | fromYaml -}}
{{- $defaultDatasources := default list -}}
{{- range $ds := $Values.defaultDatasources.victoriametrics.datasources }}
diff --git a/charts/victoria-metrics-k8s-stack/values.yaml b/charts/victoria-metrics-k8s-stack/values.yaml
index d30e94ed8..29f473c4b 100644
--- a/charts/victoria-metrics-k8s-stack/values.yaml
+++ b/charts/victoria-metrics-k8s-stack/values.yaml
@@ -248,18 +248,22 @@ additionalVictoriaMetricsMap:
# - record: my_record
# expr: 100 * my_record
-# -- External VM read and write URLs
-externalVM:
- read:
- url: ""
- # bearerTokenSecret:
- # name: dbaas-read-access-token
- # key: bearerToken
- write:
- url: ""
- # bearerTokenSecret:
- # name: dbaas-read-access-token
- # key: bearerToken
+external:
+ # -- External Grafana host
+ grafana:
+ host: grafana.external.host
+ # -- External VM read and write URLs
+ vm:
+ read:
+ url: ""
+ # bearerTokenSecret:
+ # name: dbaas-read-access-token
+ # key: bearerToken
+ write:
+ url: ""
+ # bearerTokenSecret:
+ # name: dbaas-read-access-token
+ # key: bearerToken
# Configures vmsingle params
vmsingle:
@@ -732,8 +736,8 @@ vmauth:
annotations: {}
# -- (object) Full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec)
# It's possible to use given below predefined variables in spec:
- # * `{{ .vm.read }}` - parsed vmselect, vmsingle or externalVM.read URL
- # * `{{ .vm.write }}` - parsed vminsert, vmsingle or externalVM.write URL
+ # * `{{ .vm.read }}` - parsed vmselect, vmsingle or external.vm.read URL
+ # * `{{ .vm.write }}` - parsed vminsert, vmsingle or external.vm.write URL
spec:
port: "8427"
unauthorizedUserAccessSpec:
@@ -809,7 +813,7 @@ defaultDatasources:
isDefault: true
- name: VictoriaMetrics (DS)
isDefault: false
- type: victoriametrics-datasource
+ type: victoriametrics-metrics-datasource
# -- List of alertmanager datasources.
# Alertmanager generated `url` will be added to each datasource in template if alertmanager is enabled
alertmanager:
@@ -861,11 +865,11 @@ grafana:
# Note that Grafana will need internet access to install the datasource plugin.
# Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana:
#plugins:
- # - "https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/download/v0.10.1/victoriametrics-datasource-v0.10.1.zip;victoriametrics-datasource"
+ # - "https://github.com/VictoriaMetrics/victoriametrics-datasource/releases/download/v0.12.0/victoriametrics-metrics-datasource-v0.12.0.zip;victoriametrics-metrics-datasource"
#grafana.ini:
# plugins:
# # Why VictoriaMetrics datasource is unsigned: https://github.com/VictoriaMetrics/grafana-datasource/blob/main/README.md#why-victoriametrics-datasource-is-unsigned
- # allow_loading_unsigned_plugins: victoriametrics-datasource
+ # allow_loading_unsigned_plugins: victoriametrics-metrics-datasource
ingress:
enabled: false
diff --git a/charts/victoria-metrics-operator/README.md b/charts/victoria-metrics-operator/README.md
index 0ee400775..51d2b1a96 100644
--- a/charts/victoria-metrics-operator/README.md
+++ b/charts/victoria-metrics-operator/README.md
@@ -259,6 +259,7 @@ Change the values according to the need of the environment in ``victoria-metrics
commonName: ca.validation.victoriametrics
duration: 63800h0m0s
cert:
+ commonName: ""
duration: 45800h0m0s
enabled: false
issuer: {}
@@ -294,6 +295,7 @@ tls:
commonName: ca.validation.victoriametrics
duration: 63800h0m0s
cert:
+ commonName: ""
duration: 45800h0m0s
enabled: false
issuer: {}
@@ -319,7 +321,8 @@ duration: 63800h0m0s
admissionWebhooks.certManager.cert |
object |
-duration: 45800h0m0s
+commonName: ""
+duration: 45800h0m0s
|
diff --git a/charts/victoria-metrics-operator/templates/webhook.yaml b/charts/victoria-metrics-operator/templates/webhook.yaml
index 82c4f342b..02802d476 100644
--- a/charts/victoria-metrics-operator/templates/webhook.yaml
+++ b/charts/victoria-metrics-operator/templates/webhook.yaml
@@ -4,12 +4,13 @@
{{- $fullname := include "vm.plain.fullname" $ctx }}
{{- $domain := ((.Values.global).cluster).dnsDomain }}
{{- $ns := include "vm.namespace" $ctx }}
+{{- $certManager := .Values.admissionWebhooks.certManager }}
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: {{ $fullname }}-admission
- {{- if .Values.admissionWebhooks.certManager.enabled }}
+ {{- if $certManager.enabled }}
annotations:
certmanager.k8s.io/inject-ca-from: {{ printf "%s/%s-validation" $ns $fullname | quote }}
cert-manager.io/inject-ca-from: {{ printf "%s/%s-validation" $ns $fullname | quote }}
@@ -24,7 +25,7 @@ webhooks:
name: {{ $fullname }}
path: /validate-operator-victoriametrics-com-v1beta1-{{ $name }}
port: {{ $.Values.service.webhookPort }}
- {{- if not $.Values.admissionWebhooks.certManager.enabled }}
+ {{- if not $certManager.enabled }}
caBundle: {{ $tls.caCert }}
{{- end }}
failurePolicy: {{ $.Values.admissionWebhooks.policy }}
@@ -48,8 +49,8 @@ webhooks:
- {{ $name }}{{ ternary "" "s" (hasSuffix "s" $name) }}
{{- end }}
{{- end }}
-{{- if .Values.admissionWebhooks.certManager.enabled }}
-{{- if not .Values.admissionWebhooks.certManager.issuer }}
+{{- if $certManager.enabled }}
+{{- if not $certManager.issuer }}
---
apiVersion: cert-manager.io/v1
kind: Issuer
@@ -66,10 +67,10 @@ metadata:
namespace: {{ $ns }}
spec:
secretName: {{ $fullname }}-root-ca
- duration: {{ .Values.admissionWebhooks.certManager.ca.duration }}
+ duration: {{ $certManager.ca.duration }}
issuerRef:
name: {{ $fullname }}-root
- commonName: {{ .Values.admissionWebhooks.certManager.ca.commonName }}
+ commonName: {{ $certManager.ca.commonName }}
isCA: true
---
apiVersion: cert-manager.io/v1
@@ -90,8 +91,11 @@ metadata:
namespace: {{ $ns }}
spec:
secretName: {{ $fullname }}-validation
- duration: {{ .Values.admissionWebhooks.certManager.cert.duration }}
- {{- $issuerRef := .Values.admissionWebhooks.certManager.issuer | default dict }}
+ duration: {{ $certManager.cert.duration }}
+ {{- with $certManager.cert.commonName }}
+ commonName: {{ . }}
+ {{- end }}
+ {{- $issuerRef := $certManager.issuer | default dict }}
{{- if empty $issuerRef }}
{{- $_ := set $issuerRef "name" (printf "%s-issuer" $fullname) }}
{{- end }}
diff --git a/charts/victoria-metrics-operator/values.yaml b/charts/victoria-metrics-operator/values.yaml
index f390a178d..f145115db 100644
--- a/charts/victoria-metrics-operator/values.yaml
+++ b/charts/victoria-metrics-operator/values.yaml
@@ -265,6 +265,7 @@ admissionWebhooks:
# -- Certificate parameters
cert:
duration: 45800h0m0s
+ commonName: ""
keepTLSSecret: true
# tls specifies TLS cert/key for the webhook
tls:
diff --git a/hack/rules-and-dashboards/sync_rules.py b/hack/rules-and-dashboards/sync_rules.py
index 5b39bc53c..068ed57e0 100644
--- a/hack/rules-and-dashboards/sync_rules.py
+++ b/hack/rules-and-dashboards/sync_rules.py
@@ -178,7 +178,7 @@ def cluster_label_var(mo):
"limitGroup": ["kubernetes-storage"],
},
"http://localhost:3000": {
- "replacement": "[[ $host ]]",
+ "replacement": "[[ $grafanaHost ]]",
"init": "",
},
'job="alertmanager-main"': {
@@ -275,7 +275,7 @@ def write_group_to_file(group, url, charts):
content += "{{- $Values := (.helm).Values | default .Values }}\n"
content += '{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}\n'
content += '{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}\n'
- content += "{{- $host := index (($Values.grafana).ingress).hosts 0 }}\n"
+ content += "{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}\n"
content += escape(lines)
f.write(content)