fix: correct metrics path for MetricsEndpointProvider (#236) (#240)
* fix: correctly configure one scrape job to avoid firing alerts

The metrics endpoint configuration had two scrape jobs: one for the
regular metrics endpoint, and a second one based on a dynamic list of
targets. The latter caused the Prometheus scraper to try to scrape
metrics from *:80/metrics, which is not a valid endpoint, so the
UnitsUnavailable alert fired constantly because that job kept reporting
the endpoint as unavailable. The extra job was introduced by #94
with no apparent justification; because the seldon charm has changed
since that PR and the endpoint it configures is not valid, this commit
removes it.

This commit also refactors the MetricsEndpointProvider instantiation and
removes the metrics-port config option, since this value should not change.
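
For reference, a condensed view of the single scrape job the charm now
registers (the full change is in the src/charm.py diff below), using the
new METRICS_PATH and METRICS_PORT module-level constants:

    METRICS_PATH = "/metrics"
    METRICS_PORT = "8080"

    # Only the charm's own metrics endpoint is scraped; the extra job with
    # dynamic targets (lookaside_jobs_callable) is gone.
    self.prometheus_provider = MetricsEndpointProvider(
        charm=self,
        relation_name="metrics-endpoint",
        jobs=[
            {
                "metrics_path": METRICS_PATH,
                "static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
            }
        ],
    )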

Finally, this commit changes the alert rule interval from 0m to 5m. With
for: 0m a single failed scrape fires the alert immediately; for: 5m requires
the target to be down for five consecutive minutes, which filters out
transient scrape failures and is more appropriate for production
environments.

Part of canonical/bundle-kubeflow#564

* tests: add an assertion checking the unit is available

The test_prometheus_grafana_integration test case queried Prometheus and
checked that the request returned successfully and that the application name
and model were listed correctly. To make this test case more thorough, we add
an assertion that also checks that the unit is available; this way we avoid
issues like the one described in canonical/bundle-kubeflow#564.
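
As an illustration of what the new assertion checks: a Prometheus instant
query for the up metric returns result entries whose "value" field is a
[timestamp, sample] pair with the sample serialized as a string, and a
healthy target reports "1". A sketch with made-up timestamp and label
values:

    # Hypothetical response for /api/v1/query?query=up{juju_application=...}
    response = {
        "status": "success",
        "data": {
            "resultType": "vector",
            "result": [
                {
                    "metric": {"__name__": "up", "juju_application": "seldon-controller-manager"},
                    "value": [1707900000.0, "1"],  # [unix timestamp, value as string]
                }
            ],
        },
    }
    assert response["data"]["result"][0]["value"][1] == "1"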

Part of canonical/bundle-kubeflow#564
DnPlas authored Feb 14, 2024
1 parent c9fdaf7 commit 8631493
Showing 5 changed files with 27 additions and 25 deletions.
4 changes: 0 additions & 4 deletions config.yaml
@@ -1,8 +1,4 @@
 options:
-  metrics-port:
-    type: string
-    default: '8080'
-    description: Metrics port
   webhook-port:
     type: string
     default: '4443'
36 changes: 18 additions & 18 deletions src/charm.py
@@ -64,6 +64,8 @@
     "configmap__predictor__tempo_server__v2",
 ]
 DEFAULT_IMAGES_FILE = "src/default-custom-images.json"
+METRICS_PATH = "/metrics"
+METRICS_PORT = "8080"
 
 with open(DEFAULT_IMAGES_FILE, "r") as json_file:
     DEFAULT_IMAGES = json.load(json_file)
@@ -84,7 +86,6 @@ def __init__(self, *args):
         self._namespace = self.model.name
         self._lightkube_field_manager = "lightkube"
         self._name = self.model.app.name
-        self._metrics_port = self.model.config["metrics-port"]
         self._webhook_port = self.model.config["webhook-port"]
         self._manager_create_resources = self.model.config["manager-create-resources"]
         self._manager_log_level = self.model.config["manager-log-level"]
@@ -104,7 +105,7 @@ def __init__(self, *args):
         self._exec_command = (
             "/manager "
             "--enable-leader-election "
-            f"--metrics-addr=:{self._metrics_port} "
+            f"--metrics-addr=:{METRICS_PORT} "
             f"--webhook-port {self._webhook_port} "
             f"--log-level={self._manager_log_level} "
             f"--leader-election-id={self._manager_leader_election_id} "
@@ -131,37 +132,24 @@ def __init__(self, *args):
         self._crd_resource_handler = None
         self._configmap_resource_handler = None
 
-        metrics_port = ServicePort(int(self._metrics_port), name="metrics-port")
+        metrics_port = ServicePort(int(METRICS_PORT), name="metrics-port")
         webhook_port = ServicePort(int(self._webhook_port), name="webhook-port")
         self.service_patcher = KubernetesServicePatch(
             self,
             [metrics_port, webhook_port],
             service_name=f"{self.model.app.name}",
         )
 
-        # setup events to be handled by main event handler
-        self.framework.observe(self.on.config_changed, self._on_event)
-        for rel in self.model.relations.keys():
-            self.framework.observe(self.on[rel].relation_changed, self._on_event)
-
-        # setup events to be handled by specific event handlers
-        self.framework.observe(self.on.install, self._on_install)
-        self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
-        self.framework.observe(self.on.seldon_core_pebble_ready, self._on_pebble_ready)
-        self.framework.observe(self.on.remove, self._on_remove)
-        self.framework.observe(self.on.stop, self._on_stop)
-
         # Prometheus related config
         self.prometheus_provider = MetricsEndpointProvider(
             charm=self,
             relation_name="metrics-endpoint",
             jobs=[
                 {
-                    "metrics_path": self.config["executor-server-metrics-port-name"],
-                    "static_configs": [{"targets": ["*:{}".format(self.config["metrics-port"])]}],
+                    "metrics_path": METRICS_PATH,
+                    "static_configs": [{"targets": ["*:{}".format(METRICS_PORT)]}],
                 }
             ],
-            lookaside_jobs_callable=self.return_list_of_running_models,
         )
 
         # Dashboard related config (Grafana)
@@ -170,6 +158,18 @@ def __init__(self, *args):
             relation_name="grafana-dashboard",
         )
 
+        # setup events to be handled by main event handler
+        self.framework.observe(self.on.config_changed, self._on_event)
+        for rel in self.model.relations.keys():
+            self.framework.observe(self.on[rel].relation_changed, self._on_event)
+
+        # setup events to be handled by specific event handlers
+        self.framework.observe(self.on.install, self._on_install)
+        self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
+        self.framework.observe(self.on.seldon_core_pebble_ready, self._on_pebble_ready)
+        self.framework.observe(self.on.remove, self._on_remove)
+        self.framework.observe(self.on.stop, self._on_stop)
+
     @property
     def container(self):
         """Return container."""
2 changes: 1 addition & 1 deletion src/prometheus_alert_rules/unit_unavailable.rule
@@ -1,6 +1,6 @@
 alert: SeldonUnitIsUnavailable
 expr: up < 1
-for: 0m
+for: 5m
 labels:
   severity: critical
 annotations:
6 changes: 6 additions & 0 deletions tests/integration/test_charm.py
@@ -197,6 +197,12 @@ async def test_seldon_alert_rules(ops_test: OpsTest):
     discovered_labels = targets_result["data"]["activeTargets"][0]["discoveredLabels"]
     assert discovered_labels["juju_application"] == "seldon-controller-manager"
 
+    # query for the up metric and assert the unit is available
+    up_query_response = await fetch_url(
+        f'http://{prometheus_url}:9090/api/v1/query?query=up{{juju_application="{APP_NAME}"}}'
+    )
+    assert up_query_response["data"]["result"][0]["value"][1] == "1"
+
     # obtain alert rules from Prometheus
     rules_url = f"http://{prometheus_url}:9090/api/v1/rules"
     alert_rules_result = await fetch_url(rules_url)
4 changes: 2 additions & 2 deletions tests/unit/test_operator.py
Expand Up @@ -13,7 +13,7 @@
from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus
from ops.testing import Harness

from charm import SeldonCoreOperator
from charm import METRICS_PORT, SeldonCoreOperator

SELDON_CM_NAME = "seldon-config"

Expand Down Expand Up @@ -181,7 +181,7 @@ def test_pebble_layer(
assert (
pebble_plan_info["services"]["seldon-core"]["command"] == "/manager "
"--enable-leader-election "
f"--metrics-addr=:{harness.charm._metrics_port} "
f"--metrics-addr=:{METRICS_PORT} "
f"--webhook-port {harness.charm._webhook_port} "
f"--log-level={harness.charm._manager_log_level} "
f"--leader-election-id={harness.charm._manager_leader_election_id} "
Expand Down
