From 8631493e8e0b0e526950b354d4758fd008e16f7d Mon Sep 17 00:00:00 2001
From: Daniela Plascencia
Date: Wed, 14 Feb 2024 16:36:03 +0100
Subject: [PATCH] fix: correct metrics path for MetricsEndpointProvider (#236)
 (#240)

* fix: correctly configure one scrape job to avoid firing alerts

The metrics endpoint configuration had two scrape jobs: one for the
regular metrics endpoint, and a second one based on a dynamic list of
targets. The latter was causing the prometheus scraper to try to
scrape metrics from *:80/metrics, which is not a valid endpoint. This
caused the UnitsUnavailable alert to fire constantly, because that job
kept reporting that the endpoint was not available.

The extra job was introduced by canonical/seldon-core-operator#94 with
no apparent justification. Because the seldon charm has changed since
that PR, and the endpoint it configures is not valid, this commit
removes the extra job.

This commit also refactors the MetricsEndpointProvider instantiation
and removes the metrics-port config option, as this value should not
change.

Finally, this commit changes the alert rule interval from 0m to 5m, as
this interval is more appropriate for production environments.

Part of canonical/bundle-kubeflow#564

* tests: add an assertion for checking the unit is available

The test_prometheus_grafana_integration test case was querying
prometheus and checking that the request returned successfully and
that the application name and model were listed correctly. To make
this test case more accurate, we add an assertion that also checks
that the unit is available; this way we avoid issues like the one
described in canonical/bundle-kubeflow#564.

Part of canonical/bundle-kubeflow#564
---
 config.yaml                     |  4 ---
 src/charm.py                    | 36 +++++++++----------
 .../unit_unavailable.rule       |  2 +-
 tests/integration/test_charm.py |  6 ++++
 tests/unit/test_operator.py     |  4 +--
 5 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/config.yaml b/config.yaml
index 0661a61..2cee600 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,8 +1,4 @@
 options:
-  metrics-port:
-    type: string
-    default: '8080'
-    description: Metrics port
   webhook-port:
     type: string
     default: '4443'
diff --git a/src/charm.py b/src/charm.py
index 30be0ca..52c529c 100755
--- a/src/charm.py
+++ b/src/charm.py
@@ -64,6 +64,8 @@
     "configmap__predictor__tempo_server__v2",
 ]
 DEFAULT_IMAGES_FILE = "src/default-custom-images.json"
+METRICS_PATH = "/metrics"
+METRICS_PORT = "8080"
 
 with open(DEFAULT_IMAGES_FILE, "r") as json_file:
     DEFAULT_IMAGES = json.load(json_file)
@@ -84,7 +86,6 @@ def __init__(self, *args):
         self._namespace = self.model.name
         self._lightkube_field_manager = "lightkube"
         self._name = self.model.app.name
-        self._metrics_port = self.model.config["metrics-port"]
         self._webhook_port = self.model.config["webhook-port"]
         self._manager_create_resources = self.model.config["manager-create-resources"]
         self._manager_log_level = self.model.config["manager-log-level"]
@@ -104,7 +105,7 @@ def __init__(self, *args):
         self._exec_command = (
             "/manager "
             "--enable-leader-election "
-            f"--metrics-addr=:{self._metrics_port} "
+            f"--metrics-addr=:{METRICS_PORT} "
             f"--webhook-port {self._webhook_port} "
             f"--log-level={self._manager_log_level} "
             f"--leader-election-id={self._manager_leader_election_id} "
@@ -131,7 +132,7 @@ def __init__(self, *args):
         self._crd_resource_handler = None
         self._configmap_resource_handler = None
 
-        metrics_port = ServicePort(int(self._metrics_port), name="metrics-port")
+        metrics_port = ServicePort(int(METRICS_PORT), name="metrics-port")
         webhook_port = ServicePort(int(self._webhook_port), name="webhook-port")
         self.service_patcher = KubernetesServicePatch(
             self,
@@ -139,29 +140,16 @@ def __init__(self, *args):
             service_name=f"{self.model.app.name}",
         )
 
-        # setup events to be handled by main event handler
-        self.framework.observe(self.on.config_changed, self._on_event)
-        for rel in self.model.relations.keys():
-            self.framework.observe(self.on[rel].relation_changed, self._on_event)
-
-        # setup events to be handled by specific event handlers
-        self.framework.observe(self.on.install, self._on_install)
-        self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
-        self.framework.observe(self.on.seldon_core_pebble_ready, self._on_pebble_ready)
-        self.framework.observe(self.on.remove, self._on_remove)
-        self.framework.observe(self.on.stop, self._on_stop)
-
         # Prometheus related config
         self.prometheus_provider = MetricsEndpointProvider(
             charm=self,
             relation_name="metrics-endpoint",
             jobs=[
                 {
-                    "metrics_path": self.config["executor-server-metrics-port-name"],
-                    "static_configs": [{"targets": ["*:{}".format(self.config["metrics-port"])]}],
+                    "metrics_path": METRICS_PATH,
+                    "static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
                 }
             ],
-            lookaside_jobs_callable=self.return_list_of_running_models,
         )
 
         # Dashboard related config (Grafana)
@@ -170,6 +158,18 @@ def __init__(self, *args):
             relation_name="grafana-dashboard",
         )
 
+        # setup events to be handled by main event handler
+        self.framework.observe(self.on.config_changed, self._on_event)
+        for rel in self.model.relations.keys():
+            self.framework.observe(self.on[rel].relation_changed, self._on_event)
+
+        # setup events to be handled by specific event handlers
+        self.framework.observe(self.on.install, self._on_install)
+        self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
+        self.framework.observe(self.on.seldon_core_pebble_ready, self._on_pebble_ready)
+        self.framework.observe(self.on.remove, self._on_remove)
+        self.framework.observe(self.on.stop, self._on_stop)
+
     @property
     def container(self):
         """Return container."""
diff --git a/src/prometheus_alert_rules/unit_unavailable.rule b/src/prometheus_alert_rules/unit_unavailable.rule
index 631396a..aa09943 100644
--- a/src/prometheus_alert_rules/unit_unavailable.rule
+++ b/src/prometheus_alert_rules/unit_unavailable.rule
@@ -1,6 +1,6 @@
 alert: SeldonUnitIsUnavailable
 expr: up < 1
-for: 0m
+for: 5m
 labels:
   severity: critical
 annotations:
diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py
index 719c44c..d30ffdc 100644
--- a/tests/integration/test_charm.py
+++ b/tests/integration/test_charm.py
@@ -197,6 +197,12 @@ async def test_seldon_alert_rules(ops_test: OpsTest):
     discovered_labels = targets_result["data"]["activeTargets"][0]["discoveredLabels"]
     assert discovered_labels["juju_application"] == "seldon-controller-manager"
 
+    # query for the up metric and assert the unit is available
+    up_query_response = await fetch_url(
+        f'http://{prometheus_url}:9090/api/v1/query?query=up{{juju_application="{APP_NAME}"}}'
+    )
+    assert up_query_response["data"]["result"][0]["value"][1] == "1"
+
     # obtain alert rules from Prometheus
     rules_url = f"http://{prometheus_url}:9090/api/v1/rules"
     alert_rules_result = await fetch_url(rules_url)
diff --git a/tests/unit/test_operator.py b/tests/unit/test_operator.py
index 146cc37..22f05f6 100644
--- a/tests/unit/test_operator.py
+++ b/tests/unit/test_operator.py
@@ -13,7 +13,7 @@
 from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus
 from ops.testing import Harness
 
-from charm import SeldonCoreOperator
+from charm import METRICS_PORT, SeldonCoreOperator
 
 SELDON_CM_NAME = "seldon-config"
 
@@ -181,7 +181,7 @@ def test_pebble_layer(
     assert (
         pebble_plan_info["services"]["seldon-core"]["command"] == "/manager "
         "--enable-leader-election "
-        f"--metrics-addr=:{harness.charm._metrics_port} "
+        f"--metrics-addr=:{METRICS_PORT} "
         f"--webhook-port {harness.charm._webhook_port} "
         f"--log-level={harness.charm._manager_log_level} "
         f"--leader-election-id={harness.charm._manager_leader_election_id} "
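
Note: pulled together from the hunks above, the post-patch Prometheus wiring is a single static
scrape job. The sketch below is illustrative only; it assumes the prometheus_scrape charm library
from prometheus-k8s (the same MetricsEndpointProvider the charm already imports) and uses a
stripped-down stand-in class rather than the real SeldonCoreOperator.

    # sketch_charm.py -- minimal illustration, not part of this patch
    from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider
    from ops.charm import CharmBase
    from ops.main import main

    METRICS_PATH = "/metrics"
    METRICS_PORT = "8080"


    class SketchCharm(CharmBase):
        """Stripped-down stand-in for SeldonCoreOperator."""

        def __init__(self, *args):
            super().__init__(*args)
            # One static scrape job only: the provider substitutes each unit's
            # address for the "*" wildcard, so Prometheus ends up scraping
            # <unit-address>:8080/metrics instead of the bogus *:80/metrics.
            self.prometheus_provider = MetricsEndpointProvider(
                charm=self,
                relation_name="metrics-endpoint",
                jobs=[
                    {
                        "metrics_path": METRICS_PATH,
                        "static_configs": [{"targets": [f"*:{METRICS_PORT}"]}],
                    }
                ],
            )


    if __name__ == "__main__":
        main(SketchCharm)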
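
Similarly, the new integration-test assertion boils down to a Prometheus instant query against the
up series. A rough standalone equivalent is sketched below; it assumes the requests package and a
reachable Prometheus host, whereas the real test uses the repo's async fetch_url helper.

    # check_up.py -- standalone sketch of the assertion added to test_charm.py
    import requests


    def assert_unit_up(prometheus_host: str, app_name: str = "seldon-controller-manager") -> None:
        """Fail unless Prometheus reports the charm's scrape target as up."""
        resp = requests.get(
            f"http://{prometheus_host}:9090/api/v1/query",
            params={"query": f'up{{juju_application="{app_name}"}}'},
            timeout=10,
        )
        resp.raise_for_status()
        result = resp.json()["data"]["result"]
        # An instant-query sample is [<timestamp>, "<value>"]; "1" means the last
        # scrape succeeded, so SeldonUnitIsUnavailable should stay silent.
        assert result and result[0]["value"][1] == "1"


    if __name__ == "__main__":
        assert_unit_up("localhost")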