Skip to content

Commit

Permalink
Add alert rules to knative-operator based on the KF093 spec (#215) (#229
Browse files Browse the repository at this point in the history
)

* Add alert rules to knative-operator based on the KF093 spec

* add cos integration tests for alert rules

Co-authored-by: Robert Gildein <[email protected]>
  • Loading branch information
misohu and rgildein authored Oct 11, 2024
1 parent fcdb75a commit 0b33d80
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prometheus alert rules for the knative-operator charm's scrape targets.
groups:
  - name: KubeflowKnativeOperatorServices
    rules:
      - alert: KubeflowServiceDown
        # Fires once `up` has stayed below 1 for five minutes.
        expr: up{} < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
          description: |
            One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
            LABELS = {{ $labels }}

      - alert: KubeflowServiceIsNotStable
        # Fires immediately (for: 0m) when the 10-minute average of `up`
        # drops below 0.5.
        expr: avg_over_time(up{}[10m]) < 0.5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
          description: |
            {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes.
            LABELS = {{ $labels }}
10 changes: 10 additions & 0 deletions charms/knative-operator/tests/integration/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import pytest
import yaml
from charmed_kubeflow_chisme.testing import (
assert_alert_rules,
assert_logging,
assert_metrics_endpoint,
deploy_and_assert_grafana_agent,
get_alert_rules,
)
from pytest_operator.plugin import OpsTest

Expand Down Expand Up @@ -65,3 +67,11 @@ async def test_metrics_enpoint(ops_test):
app = ops_test.model.applications[APP_NAME]
# Note(rgildein): Without the otel-collector relation we will not see otel as a target.
await assert_metrics_endpoint(app, metrics_port=9090, metrics_path="/metrics")


async def test_alert_rules(ops_test):
    """Compare the charm's alert rules with those in the relation data bag."""
    application = ops_test.model.applications[APP_NAME]
    # No path argument: get_alert_rules() falls back to its own default location.
    expected_rules = get_alert_rules()
    log.info("found alert_rules: %s", expected_rules)
    await assert_alert_rules(application, expected_rules)
12 changes: 12 additions & 0 deletions tests/test_cos_integration.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
import logging
from pathlib import Path

import pytest
from charmed_kubeflow_chisme.testing import (
assert_alert_rules,
assert_logging,
assert_metrics_endpoint,
deploy_and_assert_grafana_agent,
get_alert_rules,
)
from pytest_operator.plugin import OpsTest
from test_bundle import KNATIVE_OPERATOR_RESOURCES
Expand All @@ -16,6 +19,7 @@
# knative-operator is the charm that actually talks to prometheus
# to configure the OpenTelemetry collector to be scraped
APP_NAME = "knative-operator"
# Relative path to the charm's Prometheus alert rules directory; passed to
# get_alert_rules() in test_alert_rules.
# NOTE(review): relative to the current working directory — presumably the
# repository root when the tests are run; confirm.
ALERT_RULES_PATH = Path(f"./charms/{APP_NAME}/src/prometheus_alert_rules")


@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -93,3 +97,11 @@ async def test_metrics_enpoint(ops_test):
await ops_test.model.wait_for_idle(raise_on_blocked=False, timeout=60 * 5, idle_period=60)

await assert_metrics_endpoint(app, metrics_port=8889, metrics_path="/metrics")


async def test_alert_rules(ops_test):
    """Compare the charm's alert rules with those in the relation data bag."""
    application = ops_test.model.applications[APP_NAME]
    # Rules are loaded from the charm's source tree via ALERT_RULES_PATH.
    expected_rules = get_alert_rules(ALERT_RULES_PATH)
    log.info("found alert_rules: %s", expected_rules)
    await assert_alert_rules(application, expected_rules)

0 comments on commit 0b33d80

Please sign in to comment.