updated rules and dashboards, release vlogs 0.8.4
AndrewChubatiuk committed Nov 26, 2024
1 parent 19ec8f9 commit 3383b67
Showing 18 changed files with 109 additions and 73 deletions.
2 changes: 2 additions & 0 deletions charts/victoria-logs-single/CHANGELOG.md
@@ -1,6 +1,8 @@
## Next release

- updated common dependency 0.0.31 -> 0.0.32
- synced rules and dashboards
- added .Values.dashboards.namespace to override default namespace for dashboards

## 0.8.3

2 changes: 1 addition & 1 deletion charts/victoria-logs-single/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
appVersion: v1.0.0
description: Victoria Logs Single version - high-performance, cost-effective and scalable logs storage
name: victoria-logs-single
version: 0.8.3
version: 0.8.4
sources:
- https://github.com/VictoriaMetrics/helm-charts
icon: https://avatars.githubusercontent.com/u/43720803?s=200&v=4
11 changes: 11 additions & 0 deletions charts/victoria-logs-single/README.md
@@ -204,6 +204,17 @@ Change the values according to the need of the environment in ``victoria-logs-si
</pre>
</td>
<td><p>Dashboard labels</p>
</td>
</tr>
<tr>
<td>dashboards.namespace</td>
<td>string</td>
<td><pre class="helm-vars-default-value" language-yaml" lang="">
<code class="language-yaml">""
</code>
</pre>
</td>
<td><p>Override default namespace, where to create dashboards</p>
</td>
</tr>
<tr>
9 changes: 6 additions & 3 deletions charts/victoria-logs-single/files/rules/generated/vlogs.yaml
@@ -1,4 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
@@ -10,10 +13,10 @@ rules:
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} (job={{`{{`}} $labels.job {{`}}`}}) will run out of disk space soon'
condition: '{{ true }}'
expr: |-
sum(vl_data_size_bytes) by (job,instance,{{ $Values.global.clusterLabel }}) /
sum(vl_data_size_bytes) by (job,instance,{{ $clusterLabel }}) /
(
sum(vl_free_disk_space_bytes) by (job,instance,{{ $Values.global.clusterLabel }}) +
sum(vl_data_size_bytes) by (job,instance,{{ $Values.global.clusterLabel }})
sum(vl_free_disk_space_bytes) by (job,instance,{{ $clusterLabel }}) +
sum(vl_data_size_bytes) by (job,instance,{{ $clusterLabel }})
) > 0.8
for: 30m
labels:
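For readers following the vlogs rules hunk above: the repeated `$Values.global.clusterLabel` lookups are hoisted into top-level template variables with `| default` fallbacks. A minimal values sketch showing where each hoisted variable comes from (the host value is a hypothetical example; the defaults apply when the keys are unset):

```yaml
# Sketch only: chart values that feed the hoisted template variables
global:
  clusterLabel: cluster        # -> {{ $clusterLabel }}, falls back to "cluster"
defaultRules:
  runbookUrl: https://runbooks.prometheus-operator.dev/runbooks   # -> {{ $runbookUrl }}
grafana:
  ingress:
    hosts:
      - grafana.example.com    # hypothetical; index 0 becomes {{ $host }}
```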
4 changes: 2 additions & 2 deletions charts/victoria-logs-single/templates/dashboard.yaml
@@ -33,7 +33,7 @@ Delete condition from dashboard
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
namespace: {{ include "vm.namespace" $ }}
namespace: {{ $.Values.dashboards.namespace | default (include "vm.namespace" $) }}
name: {{ printf "%s-%s" (include "vm.fullname" $) $dashboardName | replace "_" "" }}
labels: {{ include "vm.labels" $ctx | nindent 4 }}
{{- with $.Values.dashboards.annotations }}
@@ -47,7 +47,7 @@ spec:
apiVersion: v1
kind: ConfigMap
metadata:
namespace: {{ include "vm.namespace" $ }}
namespace: {{ $.Values.dashboards.namespace | default (include "vm.namespace" $) }}
name: {{ printf "%s-%s" (include "vm.fullname" $) $dashboardName | replace "_" "" }}
labels: {{ include "vm.labels" $ctx | nindent 4 }}
{{- with $.Values.dashboards.annotations }}
2 changes: 2 additions & 0 deletions charts/victoria-logs-single/values.yaml
@@ -369,6 +369,8 @@ dashboards:
# grafana_dashboard: "1"
# -- Dashboard annotations
annotations: {}
# -- Override default namespace, where to create dashboards
namespace: ""
grafanaOperator:
enabled: false
spec:
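Combined with the `dashboard.yaml` template change above, the new key lets generated dashboards land in a namespace other than the chart's default. A minimal values sketch (the namespace name is hypothetical; an empty string keeps the fallback from `include "vm.namespace"`):

```yaml
# Sketch only: override where GrafanaDashboard/ConfigMap objects are created
dashboards:
  namespace: monitoring   # hypothetical namespace; "" falls back to the chart default
```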
2 changes: 1 addition & 1 deletion charts/victoria-metrics-alert/README.md
@@ -741,7 +741,7 @@ tag: v0.25.0
<td>alertmanager.service.servicePort</td>
<td>int</td>
<td><pre class="helm-vars-default-value" language-yaml" lang="">
<code class="language-yaml">8880
<code class="language-yaml">9093
</code>
</pre>
</td>
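The documented default for `alertmanager.service.servicePort` moves from 8880 to 9093, Alertmanager's conventional port. Pinning it explicitly would look roughly like this (a sketch against the documented key path):

```yaml
# Sketch only: explicit Alertmanager service port in victoria-metrics-alert values
alertmanager:
  service:
    servicePort: 9093
```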
2 changes: 1 addition & 1 deletion charts/victoria-metrics-k8s-stack/CHANGELOG.md
@@ -1,6 +1,6 @@
## Next release

- TODO
- synced rules

## 0.29.1

@@ -3577,7 +3577,7 @@ panels:
h: 8
w: 12
x: 0
'y': 30
'y': 6
id: 73
options:
legend:
@@ -3665,7 +3665,7 @@ panels:
h: 8
w: 12
x: 12
'y': 22
'y': 6
id: 131
options:
legend:
@@ -3743,7 +3743,7 @@ panels:
h: 8
w: 12
x: 0
'y': 30
'y': 14
id: 130
options:
legend:
@@ -3831,7 +3831,7 @@ panels:
h: 8
w: 12
x: 12
'y': 30
'y': 14
id: 77
options:
legend:
@@ -19,18 +19,18 @@ rules:
labels:
verb: write
record: code:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ $clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_count{job="apiserver"}[1h]))
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ $clusterLabel }}) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ $clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ $clusterLabel }}) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"})
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ $clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"} * 24 * 30)
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- condition: '{{ true }}'
expr: |-
1 - (
@@ -170,11 +170,11 @@ rules:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.'
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour. (reason: "{{`{{`}} $labels.reason {{`}}`}}").'
runbook_url: '{{ $runbookUrl }}/kubernetes/kubecontainerwaiting'
summary: 'Pod container waiting longer than 1 hour'
condition: '{{ true }}'
expr: sum by (namespace,pod,container,{{ $clusterLabel }}) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) > 0
expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 1h
labels:
severity: warning
@@ -108,9 +108,9 @@ rules:
summary: 'Processes experience elevated CPU throttling.'
condition: '{{ true }}'
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,{{ $clusterLabel }})
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,{{ $clusterLabel }})
sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node)
> ( 25 / 100 )
for: 15m
labels:
@@ -11,7 +11,10 @@ rules:
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job,{{ $clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ $clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
expr: |-
histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
and
on (job,instance,{{ $clusterLabel }}) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
for: 5m
labels:
severity: warning
@@ -21,7 +24,10 @@ rules:
runbook_url: '{{ $runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on (job,{{ $clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ $clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
expr: |-
histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
and
on (job,instance,{{ $clusterLabel }}) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
for: 5m
labels:
severity: critical
@@ -1,4 +1,7 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
condition: '{{ true }}'
name: vm-health
rules:
@@ -65,7 +68,7 @@ rules:
'
summary: '"{{`{{`}} $labels.job {{`}}`}}"("{{`{{`}} $labels.instance {{`}}`}}") has insufficient CPU resources for >15m'
condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket[5m])) by (le,job,instance,{{ $Values.global.clusterLabel }})) > 0.1
expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket[5m])) by (le,job,instance,{{ $clusterLabel }})) > 0.1
for: 15m
labels:
severity: critical
@@ -1,12 +1,15 @@
{{- $Values := (.helm).Values | default .Values }}
{{- $runbookUrl := ($Values.defaultRules).runbookUrl | default "https://runbooks.prometheus-operator.dev/runbooks" }}
{{- $clusterLabel := ($Values.global).clusterLabel | default "cluster" }}
{{- $host := index (($Values.grafana).ingress).hosts 0 }}
concurrency: 2
condition: '{{ true }}'
interval: 30s
name: vmagent
rules:
- alert: PersistentQueueIsDroppingData
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=49&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Vmagent dropped {{`{{`}} $value | humanize1024 {{`}}`}} from persistent queue on instance {{`{{`}} $labels.instance {{`}}`}} for the last 10m.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} is dropping data from persistent queue'
condition: '{{ true }}'
@@ -16,7 +19,7 @@ rules:
severity: critical
- alert: RejectedRemoteWriteDataBlocksAreDropped
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=79&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects.'
summary: 'Vmagent is dropping data blocks that are rejected by remote storage'
condition: '{{ true }}'
@@ -26,7 +29,7 @@ rules:
severity: warning
- alert: TooManyScrapeErrors
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=31&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} fails to scrape targets for last 15m'
summary: 'Vmagent fails to scrape one or more targets'
condition: '{{ true }}'
@@ -36,7 +39,7 @@ rules:
severity: warning
- alert: TooManyWriteErrors
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=77&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} responds with errors to write requests for last 15m.'
summary: 'Vmagent responds with too many errors on data ingestion protocols'
condition: '{{ true }}'
@@ -49,7 +52,7 @@ rules:
severity: warning
- alert: TooManyRemoteWriteErrors
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=61&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "Vmagent fails to push data via remote write protocol to destination \"{{`{{`}} $labels.url {{`}}`}}\"\n Ensure that destination is up and reachable."
summary: 'Job "{{`{{`}} $labels.job {{`}}`}}" on instance {{`{{`}} $labels.instance {{`}}`}} fails to push to remote storage'
condition: '{{ true }}'
@@ -59,7 +62,7 @@ rules:
severity: warning
- alert: RemoteWriteConnectionIsSaturated
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=84&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: "The remote write connection between vmagent \"{{`{{`}} $labels.job {{`}}`}}\" (instance {{`{{`}} $labels.instance {{`}}`}}) and destination \"{{`{{`}} $labels.url {{`}}`}}\" is saturated by more than 90% and vmagent won't be able to keep up.\n This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage."
summary: 'Remote write connection from "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) to {{`{{`}} $labels.url {{`}}`}} is saturated'
condition: '{{ true }}'
@@ -69,7 +72,7 @@ rules:
severity: warning
- alert: PersistentQueueForWritesIsSaturated
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=98&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Persistent queue writes for vmagent "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) are saturated by more than 90% and vmagent won''t be able to keep up with flushing data on disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.'
summary: 'Persistent queue writes for instance {{`{{`}} $labels.instance {{`}}`}} are saturated'
condition: '{{ true }}'
@@ -79,7 +82,7 @@ rules:
severity: warning
- alert: PersistentQueueForReadsIsSaturated
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=99&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Persistent queue reads for vmagent "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) are saturated by more than 90% and vmagent won''t be able to keep up with reading data from the disk. In this case, consider to decrease load on the vmagent or improve the disk throughput.'
summary: 'Persistent queue reads for instance {{`{{`}} $labels.instance {{`}}`}} are saturated'
condition: '{{ true }}'
@@ -89,7 +92,7 @@ rules:
severity: warning
- alert: SeriesLimitHourReached
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=88&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} reached 90% of the limit'
condition: '{{ true }}'
@@ -98,7 +101,7 @@ rules:
severity: critical
- alert: SeriesLimitDayReached
annotations:
dashboard: '{{ index (($Values.grafana).ingress).hosts 0 }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{`{{`}} $labels.instance {{`}}`}}'
dashboard: '{{ $host }}/d/G7Z9GzMGz?viewPanel=90&var-instance={{`{{`}} $labels.instance {{`}}`}}'
description: 'Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value. Then samples for new time series will be dropped instead of sending them to remote storage systems.'
summary: 'Instance {{`{{`}} $labels.instance {{`}}`}} reached 90% of the limit'
condition: '{{ true }}'
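The vmagent hunks above only swap the inline `index (($Values.grafana).ingress).hosts 0` lookups for the hoisted `$host` variable; the rendered annotations stay the same. With a hypothetical host of `grafana.example.com`, the first alert's dashboard link would render roughly as:

```yaml
# Rendered output sketch (host is hypothetical; {{`{{`}} ... {{`}}`}} un-escapes to {{ ... }})
dashboard: 'grafana.example.com/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}'
```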