Skip to content

Commit

Permalink
Add alert rules to minio based on the KF093 spec (#184)
Browse files Browse the repository at this point in the history
* Add alert rules to minio based on the KF093 spec

* Delete src/prometheus_alert_rules/unit_unavailable.rule

* Add COS integration tests for alert rules
  • Loading branch information
rgildein authored and misohu committed Oct 8, 2024
1 parent de6f16e commit 5724367
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 105 deletions.
3 changes: 3 additions & 0 deletions requirements-integration.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ requests
selenium
selenium-wire
tenacity
# Require >=0.4.0 because the reusable test infrastructure is only available from that version onward.
# This lower bound prevents pip-compile from resolving to an earlier version.
charmed-kubeflow-chisme>=0.4.0
71 changes: 64 additions & 7 deletions requirements-integration.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
#
# pip-compile requirements-integration.in
#
appnope==0.1.4
# via ipython
anyio==4.4.0
# via httpx
asttokens==2.4.0
# via stack-data
attrs==23.1.0
# via
# jsonschema
# outcome
# trio
backcall==0.2.0
Expand All @@ -24,6 +25,8 @@ cachetools==5.3.1
# via google-auth
certifi==2023.7.22
# via
# httpcore
# httpx
# kubernetes
# requests
# selenium
Expand All @@ -32,6 +35,8 @@ cffi==1.15.1
# via
# cryptography
# pynacl
charmed-kubeflow-chisme==0.4.3
# via -r requirements-integration.in
charset-normalizer==3.2.0
# via requests
cryptography==41.0.3
Expand All @@ -42,8 +47,11 @@ decorator==5.1.1
# via
# ipdb
# ipython
deepdiff==6.2.1
# via charmed-kubeflow-chisme
exceptiongroup==1.1.3
# via
# anyio
# pytest
# trio
# trio-websocket
Expand All @@ -52,11 +60,17 @@ executing==1.2.0
google-auth==2.17.3
# via kubernetes
h11==0.14.0
# via wsproto
# via
# httpcore
# wsproto
h2==4.1.0
# via selenium-wire
hpack==4.0.0
# via h2
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via lightkube
hvac==1.2.0
# via juju
hyperframe==6.0.1
Expand All @@ -65,8 +79,12 @@ hyperframe==6.0.1
# selenium-wire
idna==3.4
# via
# anyio
# httpx
# requests
# trio
importlib-resources==6.4.3
# via jsonschema
iniconfig==2.0.0
# via pytest
ipdb==0.13.13
Expand All @@ -76,15 +94,24 @@ ipython==8.12.2
jedi==0.19.0
# via ipython
jinja2==3.1.2
# via pytest-operator
# via
# charmed-kubeflow-chisme
# pytest-operator
jsonschema==4.17.3
# via serialized-data-interface
juju==3.2.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
kaitaistruct==0.10
# via selenium-wire
kubernetes==27.2.0
# via juju
lightkube==0.15.3
# via charmed-kubeflow-chisme
lightkube-models==1.30.0.8
# via lightkube
macaroonbakery==1.3.1
# via juju
markupsafe==2.1.3
Expand All @@ -97,6 +124,12 @@ oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
ops==2.15.0
# via
# charmed-kubeflow-chisme
# serialized-data-interface
ordered-set==4.1.0
# via deepdiff
outcome==1.2.0
# via trio
packaging==23.1
Expand All @@ -109,6 +142,8 @@ pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pkgutil-resolve-name==1.3.10
# via jsonschema
pluggy==1.3.0
# via pytest
prompt-toolkit==3.0.39
Expand Down Expand Up @@ -148,6 +183,8 @@ pyrfc3339==1.1
# via
# juju
# macaroonbakery
pyrsistent==0.20.0
# via jsonschema
pysocks==1.7.1
# via
# selenium-wire
Expand All @@ -170,24 +207,34 @@ pyyaml==6.0.1
# -r requirements-integration.in
# juju
# kubernetes
# lightkube
# ops
# pytest-operator
# serialized-data-interface
requests==2.31.0
# via
# -r requirements-integration.in
# hvac
# kubernetes
# macaroonbakery
# requests-oauthlib
# serialized-data-interface
requests-oauthlib==1.3.1
# via kubernetes
rsa==4.9
# via google-auth
ruamel-yaml==0.18.6
# via charmed-kubeflow-chisme
ruamel-yaml-clib==0.2.8
# via ruamel-yaml
selenium==4.12.0
# via
# -r requirements-integration.in
# selenium-wire
selenium-wire==5.1.0
# via -r requirements-integration.in
serialized-data-interface==0.7.0
# via charmed-kubeflow-chisme
six==1.16.0
# via
# asttokens
Expand All @@ -198,13 +245,18 @@ six==1.16.0
# pymacaroons
# python-dateutil
sniffio==1.3.0
# via trio
# via
# anyio
# httpx
# trio
sortedcontainers==2.4.0
# via trio
stack-data==0.6.2
# via ipython
tenacity==8.2.3
# via -r requirements-integration.in
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
tomli==2.0.1
# via
# ipdb
Expand All @@ -223,6 +275,7 @@ trio-websocket==0.10.4
# via selenium
typing-extensions==4.7.1
# via
# anyio
# ipython
# typing-inspect
typing-inspect==0.9.0
Expand All @@ -235,12 +288,16 @@ urllib3[socks]==2.0.4
wcwidth==0.2.6
# via prompt-toolkit
websocket-client==1.6.2
# via kubernetes
# via
# kubernetes
# ops
websockets==8.1
# via juju
wsproto==1.2.0
# via
# selenium-wire
# trio-websocket
zipp==3.20.0
# via importlib-resources
zstandard==0.21.0
# via selenium-wire
24 changes: 24 additions & 0 deletions src/prometheus_alert_rules/KubeflowMinioServices.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Availability alerts evaluated against the Prometheus `up` series.
# The juju_* labels used in the annotations are presumably attached by the
# Juju topology of the scrape job -- TODO confirm against the scrape config.
groups:
  - name: KubeflowMinioServices
    rules:
      # Critical: a target has reported `up` below 1 continuously for 5 minutes.
      - alert: KubeflowServiceDown
        expr: up{} < 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})'
          description: |
            One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
            LABELS = {{ $labels }}

      # Warning: the 10-minute average of `up` is below 0.5. With `for: 0m`
      # the alert fires as soon as the expression matches, with no extra
      # pending period.
      - alert: KubeflowServiceIsNotStable
        expr: avg_over_time(up{}[10m]) < 0.5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: '{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})'
          description: |
            {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes.
            LABELS = {{ $labels }}
10 changes: 0 additions & 10 deletions src/prometheus_alert_rules/unit_unavailable.rule

This file was deleted.

Loading

0 comments on commit 5724367

Please sign in to comment.