Skip to content

Commit

Permalink
Add alert rules to mlflow-server based on the KF093 spec (#262)
Browse files Browse the repository at this point in the history
* Add alert rules to mlflow-server based on the KF093 spec

* Delete src/prometheus_alert_rules/unit_unavailable.rule

* Use chisme to test cos integration
  • Loading branch information
rgildein authored Aug 21, 2024
1 parent 8bbf84c commit 9f32fb6
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 166 deletions.
5 changes: 4 additions & 1 deletion requirements-integration.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,7 @@ minio
mlflow
pytest-operator
requests
-r requirements.txt
lightkube
lightkube-models>=1.25.4.4
# Required by the integration tests, which use chisme's abstraction for testing the COS integration
charmed-kubeflow-chisme>=0.4.0
106 changes: 22 additions & 84 deletions requirements-integration.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,13 @@ aiosignal==1.3.1
alembic==1.12.0
# via mlflow
anyio==3.7.1
# via
# -r requirements.txt
# httpcore
appnope==0.1.4
# via ipython
# via httpcore
asttokens==2.4.0
# via stack-data
async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via
# -r requirements.txt
# aiohttp
# jsonschema
backcall==0.2.0
Expand All @@ -31,18 +26,10 @@ bcrypt==4.0.1
# via paramiko
blinker==1.6.2
# via flask
boto3==1.28.25
# via -r requirements.txt
botocore==1.31.25
# via
# -r requirements.txt
# boto3
# s3transfer
cachetools==5.3.1
# via google-auth
certifi==2023.7.22
# via
# -r requirements.txt
# httpcore
# httpx
# kubernetes
Expand All @@ -52,11 +39,10 @@ cffi==1.15.1
# via
# cryptography
# pynacl
charmed-kubeflow-chisme==0.2.0
# via -r requirements.txt
charmed-kubeflow-chisme==0.4.3
# via -r requirements-integration.in
charset-normalizer==3.2.0
# via
# -r requirements.txt
# aiohttp
# requests
click==8.1.7
Expand All @@ -79,16 +65,13 @@ decorator==5.1.1
# ipdb
# ipython
deepdiff==6.2.1
# via
# -r requirements.txt
# charmed-kubeflow-chisme
# via charmed-kubeflow-chisme
docker==6.1.3
# via mlflow
entrypoints==0.4
# via mlflow
exceptiongroup==1.1.2
# via
# -r requirements.txt
# anyio
# pytest
executing==1.2.0
Expand All @@ -107,25 +90,20 @@ gitpython==3.1.35
# via mlflow
google-auth==2.22.0
# via kubernetes
greenlet==3.0.3
# via sqlalchemy
gunicorn==21.2.0
# via mlflow
h11==0.14.0
# via
# -r requirements.txt
# httpcore
# via httpcore
httpcore==0.17.3
# via
# -r requirements.txt
# httpx
# via httpx
httpx==0.24.1
# via
# -r requirements.txt
# lightkube
# via lightkube
hvac==1.2.0
# via juju
idna==3.4
# via
# -r requirements.txt
# anyio
# httpx
# requests
Expand All @@ -138,7 +116,6 @@ importlib-metadata==6.8.0
# mlflow
importlib-resources==6.0.1
# via
# -r requirements.txt
# alembic
# jsonschema
# matplotlib
Expand All @@ -155,37 +132,30 @@ jedi==0.19.0
jinja2==3.1.2
# via
# -r requirements-integration.in
# -r requirements.txt
# charmed-kubeflow-chisme
# flask
# mlflow
# pytest-operator
jmespath==1.0.1
# via
# -r requirements.txt
# boto3
# botocore
joblib==1.3.2
# via scikit-learn
jsonschema==4.17.3
# via
# -r requirements.txt
# serialized-data-interface
# via serialized-data-interface
juju==3.2.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
kiwisolver==1.4.5
# via matplotlib
kubernetes==27.2.0
# via juju
lightkube==0.14.0
lightkube==0.15.3
# via
# -r requirements.txt
# -r requirements-integration.in
# charmed-kubeflow-chisme
lightkube-models==1.27.1.4
lightkube-models==1.30.0.8
# via
# -r requirements.txt
# -r requirements-integration.in
# lightkube
macaroonbakery==1.3.1
# via juju
Expand All @@ -195,7 +165,6 @@ markdown==3.4.4
# via mlflow
markupsafe==2.1.3
# via
# -r requirements.txt
# jinja2
# mako
# werkzeug
Expand Down Expand Up @@ -227,17 +196,12 @@ oauthlib==3.2.2
# databricks-cli
# kubernetes
# requests-oauthlib
oci-image==1.0.0
# via -r requirements.txt
ops==2.14.0
# via
# -r requirements.txt
# charmed-kubeflow-chisme
# serialized-data-interface
ordered-set==4.1.0
# via
# -r requirements.txt
# deepdiff
# via deepdiff
packaging==23.1
# via
# docker
Expand All @@ -258,9 +222,7 @@ pickleshare==0.7.5
pillow==10.0.0
# via matplotlib
pkgutil-resolve-name==1.3.10
# via
# -r requirements.txt
# jsonschema
# via jsonschema
pluggy==1.3.0
# via pytest
prompt-toolkit==3.0.39
Expand Down Expand Up @@ -304,9 +266,7 @@ pyrfc3339==1.1
# juju
# macaroonbakery
pyrsistent==0.19.3
# via
# -r requirements.txt
# jsonschema
# via jsonschema
pytest==7.4.2
# via
# pytest-asyncio
Expand All @@ -317,8 +277,6 @@ pytest-operator==0.29.0
# via -r requirements-integration.in
python-dateutil==2.8.2
# via
# -r requirements.txt
# botocore
# kubernetes
# matplotlib
# pandas
Expand All @@ -329,7 +287,6 @@ pytz==2023.3.post1
# pyrfc3339
pyyaml==6.0.1
# via
# -r requirements.txt
# juju
# kubernetes
# lightkube
Expand All @@ -342,7 +299,6 @@ querystring-parser==1.2.4
requests==2.31.0
# via
# -r requirements-integration.in
# -r requirements.txt
# databricks-cli
# docker
# hvac
Expand All @@ -356,30 +312,19 @@ requests-oauthlib==1.3.1
rsa==4.9
# via google-auth
ruamel-yaml==0.17.32
# via
# -r requirements.txt
# charmed-kubeflow-chisme
# via charmed-kubeflow-chisme
ruamel-yaml-clib==0.2.7
# via
# -r requirements.txt
# ruamel-yaml
s3transfer==0.6.1
# via
# -r requirements.txt
# boto3
# via ruamel-yaml
scikit-learn==1.3.0
# via mlflow
scipy==1.10.1
# via
# mlflow
# scikit-learn
serialized-data-interface==0.7.0
# via
# -r requirements.txt
# charmed-kubeflow-chisme
# via charmed-kubeflow-chisme
six==1.16.0
# via
# -r requirements.txt
# asttokens
# databricks-cli
# google-auth
Expand All @@ -393,7 +338,6 @@ smmap==5.0.0
# via gitdb
sniffio==1.3.0
# via
# -r requirements.txt
# anyio
# httpcore
# httpx
Expand All @@ -408,9 +352,7 @@ stack-data==0.6.2
tabulate==0.9.0
# via databricks-cli
tenacity==8.2.2
# via
# -r requirements.txt
# charmed-kubeflow-chisme
# via charmed-kubeflow-chisme
threadpoolctl==3.2.0
# via scikit-learn
tomli==2.0.1
Expand All @@ -435,8 +377,6 @@ tzdata==2023.3
# via pandas
urllib3==1.26.16
# via
# -r requirements.txt
# botocore
# databricks-cli
# docker
# google-auth
Expand All @@ -447,7 +387,6 @@ wcwidth==0.2.6
# via prompt-toolkit
websocket-client==1.6.1
# via
# -r requirements.txt
# docker
# kubernetes
# ops
Expand All @@ -459,6 +398,5 @@ yarl==1.9.2
# via aiohttp
zipp==3.16.2
# via
# -r requirements.txt
# importlib-metadata
# importlib-resources
24 changes: 24 additions & 0 deletions src/prometheus_alert_rules/KubeflowMlflowServerServices.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prometheus alert rules for the mlflow-server charm's scrape targets.
# Both rules are built on the `up` metric, which Prometheus records as 1 for
# a successful scrape of a target and 0 for a failed one.
# NOTE(review): the empty selector `up{}` matches every target as written;
# presumably the Juju topology labels are injected into the selector when the
# rules are forwarded to Prometheus -- confirm against the COS integration.
groups:
- name: KubeflowMlflowServerServices
  rules:
  # Fires once a target has been failing its scrapes for 5 minutes straight.
  - alert: KubeflowServiceDown
    expr: up{} < 1
    # The condition must hold continuously for this long before the alert fires.
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
      description: |
        One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
        LABELS = {{ $labels }}

  # Fires when fewer than half of the scrapes in the last 10 minutes succeeded,
  # which catches flapping targets that never stay down long enough to trip
  # the rule above.
  - alert: KubeflowServiceIsNotStable
    expr: avg_over_time(up{}[10m]) < 0.5
    # No extra hold time: the 10-minute averaging window already smooths the signal.
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
      description: |
        {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable more than 50% of the time over the last 10 minutes.
        LABELS = {{ $labels }}
10 changes: 0 additions & 10 deletions src/prometheus_alert_rules/unit_unavailable.rule

This file was deleted.

Loading

0 comments on commit 9f32fb6

Please sign in to comment.