Skip to content

Commit

Permalink
Fix prometheus rules and add unit-tests
Browse files Browse the repository at this point in the history
- The pod selector is a regex and should use a regex matches.
- Fixed calculation on REST error checks.

Also added unit-tests to validate our alerts to reduce the risk of such
bugs in the future.

Signed-off-by: Daniel Belenky <[email protected]>
  • Loading branch information
Daniel Belenky committed Mar 19, 2020
1 parent 27a1cec commit ac36a1c
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 21 deletions.
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ build-prom-spec-dumper:
hack/dockerized "go build -o rule-spec-dumper ./hack/prom-rule-ci/rule-spec-dumper.go"

prom-rules-verify: build-prom-spec-dumper
./hack/prom-rule-ci/verify-rules.sh ${current-dir}/rule-spec-dumper
./hack/prom-rule-ci/verify-rules.sh \
"${current-dir}/rule-spec-dumper" \
"${current-dir}/hack/prom-rule-ci/prom-rules-tests.yaml"

olm-push:
hack/dockerized "DOCKER_TAG=${DOCKER_TAG} CSV_VERSION=${CSV_VERSION} QUAY_USERNAME=${QUAY_USERNAME} \
Expand Down
138 changes: 138 additions & 0 deletions hack/prom-rule-ci/prom-rules-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
---
rule_files:
- /tmp/rules.verify

group_eval_order:
- kubevirt.rules

tests:
# All components are down
- interval: 1m
input_series:
- series: 'up{namespace="ci", pod="virt-api-1"}'
values: "0 0 0 0 0 0"
- series: 'up{namespace="ci", pod="virt-controller-1"}'
values: "0 0 0 0 0 0"
- series: 'up{namespace="ci", pod="virt-operator-1"}'
values: "0 0 0 0 0 0"

alert_rule_test:
- eval_time: 5m
alertname: VirtAPIDown
exp_alerts:
- exp_annotations:
summary: "All virt-api servers are down."
- eval_time: 5m
alertname: VirtControllerDown
exp_alerts:
- exp_annotations:
summary: "No running virt-controller was detected for the last 5 min."
- eval_time: 5m
alertname: VirtOperatorDown
exp_alerts:
- exp_annotations:
summary: "All virt-operator servers are down."

# Some virt controllers are not ready
- interval: 1m
input_series:
- series: 'ready_virt_controller{namespace="ci", pod="virt-controller-1"}'
values: "1 1 1 1 1 1"
- series: 'ready_virt_controller{namespace="ci", pod="virt-controller-2"}'
values: "0 0 0 0 0 0"
- series: 'up{namespace="ci", pod="virt-controller-1"}'
values: "1 1 1 1 1 1"
- series: 'up{namespace="ci", pod="virt-controller-2"}'
values: "1 1 1 1 1 1"

alert_rule_test:
- eval_time: 5m
alertname: LowReadyVirtControllersCount
exp_alerts:
- exp_annotations:
summary: "Some virt controllers are running but not ready."

# All virt controllers are not ready
- interval: 1m
input_series:
- series: 'ready_virt_controller{namespace="ci", pod="virt-controller-1"}'
values: "0 0 0 0 0 0"

alert_rule_test:
- eval_time: 5m
alertname: NoReadyVirtController
exp_alerts:
- exp_annotations:
summary: "No ready virt-controller was detected for the last 5 min."

- interval: 1m
input_series:
- series: 'ready_virt_controller{namespace="ci", pod="virt-controller-1"}'
values: "0 0 0 0 0 0"

alert_rule_test:
- eval_time: 5m
alertname: NoReadyVirtController
exp_alerts:
- exp_annotations:
summary: "No ready virt-controller was detected for the last 5 min."

# High REST errors
- interval: 1m
input_series:
- series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="200"}'
values: "2 2 2 2 2 2 2 2 2 2"
- series: 'rest_client_requests_total{namespace="ci", pod="virt-controller-1", code="400"}'
values: "10 10 10 10 10 10 10 10 10 10"
- series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="200"}'
values: "2 2 2 2 2 2 2 2 2 2"
- series: 'rest_client_requests_total{namespace="ci", pod="virt-operator-1", code="400"}'
values: "10 10 10 10 10 10 10 10 10 20"
- series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="200"}'
values: "2 2 2 2 2 2 2 2 2 2"
- series: 'rest_client_requests_total{namespace="ci", pod="virt-handler-1", code="500"}'
values: "10 10 10 10 10 10 10 10 10 20"

alert_rule_test:
- eval_time: 5m
alertname: VirtControllerRESTErrorsHigh
exp_alerts:
- exp_annotations:
summary: "More than 5% of the rest calls failed in virt-controller for the last hour"
exp_labels:
pod: "virt-controller-1"
- eval_time: 5m
alertname: VirtControllerRESTErrorsBurst
exp_alerts:
- exp_annotations:
summary: "More than 80% of the rest calls failed in virt-controller for the last 5 minutes"
exp_labels:
pod: "virt-controller-1"
- eval_time: 5m
alertname: VirtOperatorRESTErrorsHigh
exp_alerts:
- exp_annotations:
summary: "More than 5% of the rest calls failed in virt-operator for the last hour"
exp_labels:
pod: "virt-operator-1"
- eval_time: 5m
alertname: VirtOperatorRESTErrorsBurst
exp_alerts:
- exp_annotations:
summary: "More than 80% of the rest calls failed in virt-operator for the last 5 minutes"
exp_labels:
pod: "virt-operator-1"
- eval_time: 5m
alertname: VirtHandlerRESTErrorsHigh
exp_alerts:
- exp_annotations:
summary: "More than 5% of the rest calls failed in virt-handler for the last hour"
exp_labels:
pod: "virt-handler-1"
- eval_time: 5m
alertname: VirtHandlerRESTErrorsBurst
exp_alerts:
- exp_annotations:
summary: "More than 80% of the rest calls failed in virt-handler for the last 5 minutes"
exp_labels:
pod: "virt-handler-1"
34 changes: 28 additions & 6 deletions hack/prom-rule-ci/verify-rules.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#!/bin/bash -xe
#!/bin/bash -e
#
# This file is part of the KubeVirt project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
Expand All @@ -27,18 +26,41 @@ function cleanup() {
done
}

function lint() {
local target_file="${1:?}"

docker run --rm --entrypoint=/bin/promtool \
-v "$target_file":/tmp/rules.verify:ro "$PROM_IMAGE" \
check rules /tmp/rules.verify
}

function unit_test() {
local target_file="${1:?}"
local tests_file="${2:?}"

docker run --rm --entrypoint=/bin/promtool \
-v "$tests_file":/tmp/rules.test:ro \
-v "$target_file":/tmp/rules.verify:ro \
"$PROM_IMAGE" \
test rules /tmp/rules.test
}

function main() {
local prom_spec_dumper="${1:?}"
local tests_file="${2:?}"
local target_file

target_file="$(mktemp --tmpdir -u tmp.prom_rules.XXXXX)"
trap "cleanup $target_file" RETURN EXIT INT

"$prom_spec_dumper" "$target_file"

docker run --rm --entrypoint=/bin/promtool \
-v "$target_file":/tmp/rules.verify:ro "$PROM_IMAGE" \
check rules /tmp/rules.verify
echo "INFO: Rules file content:"
cat "$target_file"
echo

lint "$target_file"
unit_test "$target_file" "$tests_file"
}

main "$@"
28 changes: 14 additions & 14 deletions pkg/virt-operator/creation/components/crds.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "num_of_running_virt_api_servers",
Expr: intstr.FromString(
fmt.Sprintf("sum(up{namespace='%s', pod='virt-api-.*'})", ns),
fmt.Sprintf("sum(up{namespace='%s', pod=~'virt-api-.*'})", ns),
),
},
{
Expand All @@ -340,7 +340,7 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "num_of_running_virt_controllers",
Expr: intstr.FromString(
fmt.Sprintf("sum(up{pod='virt-controller-.*', namespace='%s'})", ns),
fmt.Sprintf("sum(up{pod=~'virt-controller-.*', namespace='%s'})", ns),
),
},
{
Expand Down Expand Up @@ -384,25 +384,25 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "vec_by_virt_controllers_all_client_rest_requests_in_last_hour",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-controller-.*', namespace='%s', code=~'[2][0-9][0-9]'}[60m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-controller-.*', namespace='%s'}[60m]))", ns),
),
},
{
Record: "vec_by_virt_controllers_failed_client_rest_requests_in_last_hour",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-controller-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[60m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-controller-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[60m]))", ns),
),
},
{
Record: "vec_by_virt_controllers_all_client_rest_requests_in_last_5m",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-controller-.*', namespace='%s', code=~'[2][0-9][0-9]'}[5m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-controller-.*', namespace='%s'}[5m]))", ns),
),
},
{
Record: "vec_by_virt_controllers_failed_client_rest_requests_in_last_5m",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-controller-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[5m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-controller-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[5m]))", ns),
),
},
{
Expand All @@ -424,7 +424,7 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "num_of_running_virt_operators",
Expr: intstr.FromString(
fmt.Sprintf("sum(up{namespace='%s', pod='virt-operator-.*'})", ns),
fmt.Sprintf("sum(up{namespace='%s', pod=~'virt-operator-.*'})", ns),
),
},
{
Expand All @@ -446,25 +446,25 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "vec_by_virt_operators_all_client_rest_requests_in_last_hour",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-operator-.*', namespace='%s', code=~'[2][0-9][0-9]'}[60m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-operator-.*', namespace='%s'}[60m]))", ns),
),
},
{
Record: "vec_by_virt_operators_failed_client_rest_requests_in_last_hour",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-operator-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[60m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-operator-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[60m]))", ns),
),
},
{
Record: "vec_by_virt_operators_all_client_rest_requests_in_last_5m",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-operator-.*', namespace='%s', code=~'[2][0-9][0-9]'}[5m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-operator-.*', namespace='%s'}[5m]))", ns),
),
},
{
Record: "vec_by_virt_operators_failed_client_rest_requests_in_last_5m",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod='virt-operator-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[5m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-operator-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[5m]))", ns),
),
},
{
Expand Down Expand Up @@ -549,21 +549,21 @@ func NewPrometheusRuleSpec(ns string) *promv1.PrometheusRuleSpec {
{
Record: "vec_by_virt_handlers_failed_client_rest_requests_in_last_5m",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-handler-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[5m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-handler-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[5m]))", ns),
),
},
{
Record: "vec_by_virt_handlers_failed_client_rest_requests_in_last_hour",
Expr: intstr.FromString(
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-handler-.*', namespace='%s', code=~'[^2][0-9][0-9]'}[60m]))", ns),
fmt.Sprintf("sum by (pod) (sum_over_time(rest_client_requests_total{pod=~'virt-handler-.*', namespace='%s', code=~'(4|5)[0-9][0-9]'}[60m]))", ns),
),
},
{
Alert: "VirtHandlerRESTErrorsHigh",
Expr: intstr.FromString("(vec_by_virt_handlers_failed_client_rest_requests_in_last_hour / vec_by_virt_handlers_all_client_rest_requests_in_last_hour) >= 0.05"),
For: "5m",
Annotations: map[string]string{
"summary": "More than 5% of the rest calls failed in virt-operator for the last hour",
"summary": "More than 5% of the rest calls failed in virt-handler for the last hour",
},
},
{
Expand Down

0 comments on commit ac36a1c

Please sign in to comment.