From 3b93e99e4fa51862aeb542807f80f949452ce8d6 Mon Sep 17 00:00:00 2001 From: Daniel Shubin Date: Sat, 9 Nov 2024 00:44:54 +0000 Subject: [PATCH] [PLAT-15903][PLAT-15904] Clock drift health check fixes Summary: First, the clockSkew health check now runs for on-prem providers. Second, the clock drift health check returns both the NTP service status and the current drift. Test Plan: Validated that the check is run for on-prem providers; also validated that the drift check returns all errors. Reviewers: muthu, nsingh Reviewed By: nsingh Subscribers: yugaware Differential Revision: https://phorge.dev.yugabyte.com/D39899 --- .../yw/commissioner/HealthChecker.java | 5 +++-- .../resources/health/node_health.py.template | 22 ++++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java index 7934738cbdb8..cba5785ca479 100644 --- a/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java +++ b/managed/src/main/java/com/yugabyte/yw/commissioner/HealthChecker.java @@ -762,8 +762,9 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) { if (nodeInfo.enableYEDIS && nodeDetails.isRedisServer) { nodeInfo.setRedisPort(nodeDetails.redisServerRpcPort); } - if (!provider.getCode().equals(CloudType.onprem.toString()) - && !provider.getCode().equals(CloudType.kubernetes.toString())) { + + // Skip clock check for k8s. + if (!provider.getCode().equals(CloudType.kubernetes.toString())) { nodeInfo.setCheckClock(true); } // Clock drift config values. 
Clock drift health checks are only run for non-k8s universes diff --git a/managed/src/main/resources/health/node_health.py.template b/managed/src/main/resources/health/node_health.py.template index 755f5dd65dbc..c663603e5868 100755 --- a/managed/src/main/resources/health/node_health.py.template +++ b/managed/src/main/resources/health/node_health.py.template @@ -1838,26 +1838,28 @@ class NodeChecker(): Metric.from_definition(YB_NODE_NTP_SERVICE_STATUS) ] service_status = get_ntp_service_status() + service_error = service_status == 0 + msgs = ["Ntp service is%s running" % " not" if service_error else ""] metrics[1].add_value(service_status) drift_ms = get_clock_drift_ms() # Returns error string on failure, int on success if isinstance(drift_ms, str): - return e.fill_and_return_entry([drift_ms], has_error=True, metrics=metrics) + msgs.append(drift_ms) + return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics) metrics[0].add_value(drift_ms) - errors = [] if drift_ms > self.time_drift_err_threshold: - errors.append("Node clock drift is {} ms, over {} ms".format( + msgs.append("Node clock drift is {} ms, over {} ms".format( drift_ms, self.time_drift_err_threshold)) - return e.fill_and_return_entry(errors, has_error=True, metrics=metrics) + return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics) if drift_ms > self.time_drift_wrn_threshold: - errors.append("Node clock drift is {} ms, over {} ms".format( + msgs.append("Node clock drift is {} ms, over {} ms".format( drift_ms, self.time_drift_wrn_threshold)) - return e.fill_and_return_warning_entry(errors, metrics=metrics) + if service_error: + return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics) + return e.fill_and_return_warning_entry(msgs, metrics=metrics) - service_error = service_status == 0 - return e.fill_and_return_entry( - ["%s ms" % drift_ms, "ntp service is%s running" % " not" if service_error else ""], - has_error=service_error, metrics=metrics) + msgs.append("%s ms" % 
drift_ms) + return e.fill_and_return_entry(msgs, has_error=service_error, metrics=metrics) def check_process_stats(self, process_name): metrics = [Metric.from_definition(YB_PROCESS_CPU_SECONDS_TOTAL),