Skip to content

Commit

Permalink
k8s-stack: added grafana external host management. fixes #1946 (#1947)
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewChubatiuk authored Jan 23, 2025
1 parent b26141b commit 34903ed
Show file tree
Hide file tree
Showing 52 changed files with 230 additions and 200 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
Expand Down
5 changes: 4 additions & 1 deletion charts/victoria-metrics-k8s-stack/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
## Next release

- TODO
**Update note**: This release contains a breaking change: `.Values.externalVM` was renamed to `.Values.external.vm`.

- add `.Values.external.grafana.host` to configure the Grafana host used in alerts when `.Values.grafana.enabled` is `false`
- rename `.Values.externalVM` to `.Values.external.vm` for consistency

## 0.34.0

Expand Down
17 changes: 14 additions & 3 deletions charts/victoria-metrics-k8s-stack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -845,7 +845,7 @@ perReplica: false
type: prometheus
- isDefault: false
name: VictoriaMetrics (DS)
type: victoriametrics-datasource
type: victoriametrics-metrics-datasource
</code>
</pre>
</td>
Expand Down Expand Up @@ -1326,7 +1326,18 @@ vmsingle:
</td>
</tr>
<tr>
<td>externalVM</td>
<td>external.grafana</td>
<td>object</td>
<td><pre class="helm-vars-default-value language-yaml" lang="plaintext">
<code class="language-yaml">host: grafana.external.host
</code>
</pre>
</td>
<td><p>External Grafana host</p>
</td>
</tr>
<tr>
<td>external.vm</td>
<td>object</td>
<td><pre class="helm-vars-default-value language-yaml" lang="plaintext">
<code class="language-yaml">read:
Expand Down Expand Up @@ -2475,7 +2486,7 @@ unauthorizedUserAccessSpec:
</code>
</pre>
</td>
<td><p>Full spec for VMAuth CRD. Allowed values described <a href="https://docs.victoriametrics.com/operator/api#vmauthspec" target="_blank">here</a> It&rsquo;s possible to use given below predefined variables in spec: * <code>{{ .vm.read }}</code> - parsed vmselect, vmsingle or externalVM.read URL * <code>{{ .vm.write }}</code> - parsed vminsert, vmsingle or externalVM.write URL</p>
<td><p>Full spec for VMAuth CRD. Allowed values are described <a href="https://docs.victoriametrics.com/operator/api#vmauthspec" target="_blank">here</a>. The following predefined variables can be used in the spec: * <code>{{ .vm.read }}</code> - parsed vmselect, vmsingle or external.vm.read URL * <code>{{ .vm.write }}</code> - parsed vminsert, vmsingle or external.vm.write URL</p>
</td>
</tr>
<tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down Expand Up @@ -268,8 +268,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-controller-manager", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -722,8 +722,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster",job="kubelet", metrics_path="/metrics", instance=~"$instance"}[$__rate_interval])) by (instance, verb, url, le))
legendFormat: '{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster",job="kubelet", metrics_path="/metrics", instance=~"$instance"}[$__rate_interval])) by (instance, verb, le))
legendFormat: '{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}}'
title: Request duration 99th quantile
type: timeseries
- datasource:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy",instance=~"$instance",verb="POST"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy",instance=~"$instance",verb="POST"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down Expand Up @@ -304,8 +304,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-proxy", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="POST"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Post Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down Expand Up @@ -262,8 +262,8 @@ panels:
- datasource:
type: {{ $defaultDatasource }}
uid: ${datasource}
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, url, le))
legendFormat: '{{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}'
expr: histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{ {{ $clusterLabel }}=~"$cluster", job="kube-scheduler", instance=~"$instance", verb="GET"}[$__rate_interval])) by (verb, le))
legendFormat: '{{`{{`}}verb{{`}}`}}'
title: Get Request Latency 99th Quantile
type: timeseries
- datasource:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.alertmanager).enabled }}'
name: alertmanager.rules
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeEtcd).enabled }}'
name: etcd
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: general.rules
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_limits
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_requests
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_cpu_usage_seconds_total
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_cache
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_limits
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_requests
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_rss
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_swap
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.container_memory_working_set_bytes
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ true }}'
name: k8s.rules.pod_owner
rules:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
{{- $grafanaHost := ternary (index (($Values.grafana).ingress).hosts 0) (($Values.external).grafana).host ($Values.grafana).enabled }}
condition: '{{ ($Values.kubeApiServer).enabled }}'
interval: 3m
name: kube-apiserver-availability.rules
Expand Down Expand Up @@ -38,22 +38,22 @@ rules:
# write too slow
sum by ({{ $clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"})
) +
(
# read too slow
sum by ({{ $clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
(
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"})
or
vector(0)
)
+
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"})
+
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"})
)
) +
# errors
Expand All @@ -72,14 +72,14 @@ rules:
(
# too slow
(
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"})
or
vector(0)
)
+
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"})
+
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"})
)
+
# errors
Expand All @@ -97,7 +97,7 @@ rules:
# too slow
sum by ({{ $clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
sum by ({{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"})
)
+
# errors
Expand Down
Loading

0 comments on commit 34903ed

Please sign in to comment.