Skip to content

Commit

Permalink
Add alert rules to metacontroller-operator based on the KF093 spec (#124) (#133)
Browse files Browse the repository at this point in the history

* Add alert rules to metacontroller-operator based on the KF093 spec

* Delete src/prometheus_alert_rules/unit_unavailable.rule

* add cos integration tests for alert rules

Co-authored-by: Robert Gildein <gildeinrobert@gmail.com>
  • Loading branch information
misohu and rgildein authored Oct 11, 2024
1 parent 4c6ff9a commit 44e3598
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 95 deletions.
3 changes: 3 additions & 0 deletions requirements-integration.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ pytest-operator
pyyaml
requests
tenacity
# Require >=0.4.0 because the reusable test infrastructure is only available in that version and above.
# This lower bound prevents pip-compile from resolving to an earlier version.
charmed-kubeflow-chisme>=0.4.0
69 changes: 65 additions & 4 deletions requirements-integration.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@ aiohttp==3.8.5
# via -r requirements-integration.in
aiosignal==1.3.1
# via aiohttp
anyio==4.4.0
# via httpx
asttokens==2.4.0
# via stack-data
async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
# via
# aiohttp
# jsonschema
backcall==0.2.0
# via ipython
bcrypt==4.0.1
Expand All @@ -22,12 +26,16 @@ cachetools==5.3.1
# via google-auth
certifi==2023.7.22
# via
# httpcore
# httpx
# kubernetes
# requests
cffi==1.15.1
# via
# cryptography
# pynacl
charmed-kubeflow-chisme==0.4.3
# via -r requirements-integration.in
charset-normalizer==3.2.0
# via
# aiohttp
Expand All @@ -38,8 +46,12 @@ decorator==5.1.1
# via
# ipdb
# ipython
deepdiff==6.2.1
# via charmed-kubeflow-chisme
exceptiongroup==1.1.3
# via pytest
# via
# anyio
# pytest
executing==1.2.0
# via stack-data
frozenlist==1.4.0
Expand All @@ -48,12 +60,22 @@ frozenlist==1.4.0
# aiosignal
google-auth==2.22.0
# via kubernetes
h11==0.14.0
# via httpcore
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via lightkube
hvac==1.2.1
# via juju
idna==3.4
# via
# anyio
# httpx
# requests
# yarl
importlib-resources==6.4.3
# via jsonschema
iniconfig==2.0.0
# via pytest
ipdb==0.13.13
Expand All @@ -65,13 +87,21 @@ jedi==0.19.0
jinja2==3.1.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
jsonschema==4.17.3
# via serialized-data-interface
juju==3.2.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
kubernetes==27.2.0
# via juju
lightkube==0.15.3
# via charmed-kubeflow-chisme
lightkube-models==1.30.0.8
# via lightkube
macaroonbakery==1.3.1
# via juju
markupsafe==2.1.3
Expand All @@ -88,6 +118,12 @@ oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
ops==2.15.0
# via
# charmed-kubeflow-chisme
# serialized-data-interface
ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via pytest
paramiko==2.12.0
Expand All @@ -98,6 +134,8 @@ pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pkgutil-resolve-name==1.3.10
# via jsonschema
pluggy==1.3.0
# via pytest
prompt-toolkit==3.0.39
Expand Down Expand Up @@ -132,6 +170,8 @@ pyrfc3339==1.1
# via
# juju
# macaroonbakery
pyrsistent==0.20.0
# via jsonschema
pytest==7.4.2
# via
# -r requirements-integration.in
Expand All @@ -150,18 +190,28 @@ pyyaml==6.0.1
# -r requirements-integration.in
# juju
# kubernetes
# lightkube
# ops
# pytest-operator
# serialized-data-interface
requests==2.31.0
# via
# -r requirements-integration.in
# hvac
# kubernetes
# macaroonbakery
# requests-oauthlib
# serialized-data-interface
requests-oauthlib==1.3.1
# via kubernetes
rsa==4.9
# via google-auth
ruamel-yaml==0.18.6
# via charmed-kubeflow-chisme
ruamel-yaml-clib==0.2.8
# via ruamel-yaml
serialized-data-interface==0.7.0
# via charmed-kubeflow-chisme
six==1.16.0
# via
# asttokens
Expand All @@ -171,10 +221,16 @@ six==1.16.0
# paramiko
# pymacaroons
# python-dateutil
sniffio==1.3.1
# via
# anyio
# httpx
stack-data==0.6.2
# via ipython
tenacity==8.2.3
# via -r requirements-integration.in
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
tomli==2.0.1
# via
# ipdb
Expand All @@ -187,6 +243,7 @@ traitlets==5.9.0
# matplotlib-inline
typing-extensions==4.7.1
# via
# anyio
# ipython
# typing-inspect
typing-inspect==0.9.0
Expand All @@ -199,8 +256,12 @@ urllib3==1.26.16
wcwidth==0.2.6
# via prompt-toolkit
websocket-client==1.6.3
# via kubernetes
# via
# kubernetes
# ops
websockets==8.1
# via juju
yarl==1.9.2
# via aiohttp
zipp==3.20.0
# via importlib-resources
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prometheus alert rules for the metacontroller-operator charm.
# Both rules are driven by the `up` series of the scraped targets.
groups:
- name: KubeflowMetacontrollerOperatorServices
  rules:
  # Critical: `up` has stayed below 1 for a full 5 minutes (see `for`),
  # so a short blip does not fire this alert.
  - alert: KubeflowServiceDown
    expr: up{} < 1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
      description: |
        One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
        LABELS = {{ $labels }}

  # Warning: the 10-minute average of `up` is below 0.5, i.e. the target was
  # down for more than half of the window. `for: 0m` fires as soon as the
  # expression is true, because the averaging window already smooths the signal.
  - alert: KubeflowServiceIsNotStable
    expr: avg_over_time(up{}[10m]) < 0.5
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
      description: |
        {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes.
        LABELS = {{ $labels }}
10 changes: 0 additions & 10 deletions src/prometheus_alert_rules/unit_unavailable.rule

This file was deleted.

105 changes: 24 additions & 81 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
# Copyright 2021 Canonical Ltd.
# See LICENSE file for licensing details.

import json
import logging
from pathlib import Path

import pytest
import requests
import tenacity
import yaml
from charmed_kubeflow_chisme.testing import (
assert_alert_rules,
assert_metrics_endpoint,
deploy_and_assert_grafana_agent,
get_alert_rules,
)
from pytest_operator.plugin import OpsTest

logger = logging.getLogger(__name__)

METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
APP_NAME = "metacontroller-operator"
PROMETHEUS = "prometheus-k8s"
PROMETHEUS_CHANNEL = "1.0/stable"
GRAFANA = "grafana-k8s"
GRAFANA_CHANNEL = "1.0/stable"
PROMETHEUS_SCRAPE = "prometheus-scrape-config-k8s"
PROMETHEUS_SCRAPE_CHANNEL = "1.0/stable"


@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -54,82 +51,28 @@ async def test_build_and_deploy_with_trust(ops_test: OpsTest):
), f"Application {app_name}.Unit {i_unit}.workload_status != active"
assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active"

# Deploying grafana-agent-k8s and add all relations
await deploy_and_assert_grafana_agent(
ops_test.model, APP_NAME, metrics=True, dashboard=True, logging=False
)

async def test_prometheus_grafana_integration(ops_test: OpsTest):
"""Deploy prometheus, grafana and required relations, then test the metrics."""
scrape_config = {"scrape_interval": "30s"}

await ops_test.juju(
"deploy",
PROMETHEUS,
"--channel",
PROMETHEUS_CHANNEL,
"--trust",
check=True,
)
await ops_test.juju(
"deploy",
GRAFANA,
"--channel",
GRAFANA_CHANNEL,
"--trust",
check=True,
)
await ops_test.model.deploy(
PROMETHEUS_SCRAPE,
channel=PROMETHEUS_SCRAPE_CHANNEL,
config=scrape_config,
)
async def test_metrics_enpoint(ops_test):
    """Check the charm's metrics endpoint via the grafana-agent-k8s relation.

    Delegates to ``assert_metrics_endpoint``, which gets all the metrics endpoints
    from the relation data bag, checks that they are available from the
    grafana-agent-k8s charm, and compares them with the port and path given here.
    """
    charm_app = ops_test.model.applications[APP_NAME]
    await assert_metrics_endpoint(
        charm_app,
        metrics_port=9999,
        metrics_path="/metrics",
    )

await ops_test.model.add_relation(APP_NAME, PROMETHEUS_SCRAPE)
await ops_test.model.add_relation(
f"{PROMETHEUS}:grafana-dashboard", f"{GRAFANA}:grafana-dashboard"
)
await ops_test.model.add_relation(
f"{APP_NAME}:grafana-dashboard", f"{GRAFANA}:grafana-dashboard"
)
await ops_test.model.add_relation(
f"{PROMETHEUS}:metrics-endpoint",
f"{PROMETHEUS_SCRAPE}:metrics-endpoint",
)

await ops_test.model.wait_for_idle(status="active", timeout=60 * 20)

status = await ops_test.model.get_status()
prometheus_unit_ip = status["applications"][PROMETHEUS]["units"][f"{PROMETHEUS}/0"]["address"]
logger.info(f"Prometheus available at http://{prometheus_unit_ip}:9090")

for attempt in retry_for_5_attempts:
logger.info(
f"Testing prometheus deployment (attempt " f"{attempt.retry_state.attempt_number})"
)
with attempt:
r = requests.get(
f"http://{prometheus_unit_ip}:9090/api/v1/query?"
f'query=up{{juju_application="{APP_NAME}"}}'
)
response = json.loads(r.content.decode("utf-8"))
response_status = response["status"]
logger.info(f"Response status is {response_status}")
assert response_status == "success"

response_metric = response["data"]["result"][0]["metric"]
assert response_metric["juju_application"] == APP_NAME
assert response_metric["juju_model"] == ops_test.model_name

# Assert the unit is available by checking the query result
# The data is presented as a list [1707357912.349, '1'], where the
# first value is a timestamp and the second value is the state of the unit
# 1 means available, 0 means unavailable
assert response["data"]["result"][0]["value"][1] == "1"


# Helper to retry calling a function over 30 seconds or 5 attempts.
# Retrying stops at whichever limit is reached first; the wait between
# attempts grows exponentially and is bounded to the 1-10 second range.
retry_for_5_attempts = tenacity.Retrying(
stop=(tenacity.stop_after_attempt(5) | tenacity.stop_after_delay(30)),
wait=tenacity.wait_exponential(multiplier=1, min=1, max=10),
# Surface the last underlying exception rather than tenacity's RetryError.
reraise=True,
)
async def test_alert_rules(ops_test):
    """Check the charm's alert rules against those defined in the relation data bag.

    The expected rules come from ``get_alert_rules()`` and the comparison is
    delegated to ``assert_alert_rules``.
    """
    charm_app = ops_test.model.applications[APP_NAME]
    expected_rules = get_alert_rules()
    logger.info("found alert_rules: %s", expected_rules)
    await assert_alert_rules(charm_app, expected_rules)


# TODO: Add test for charm removal
Expand Down

0 comments on commit 44e3598

Please sign in to comment.