Skip to content

Commit

Permalink
[PLAT-15903][PLAT-15904] Clock drift health check fixes
Browse files Browse the repository at this point in the history
Summary:
First, the clockSkew health check now runs for on-prem providers. Second, the clock
drift health check now returns both the NTP service status and the current drift.

Test Plan:
Validated that the check is run for on-prem providers.
Also validated that the drift check returns all errors.

Reviewers: muthu, nsingh

Reviewed By: nsingh

Subscribers: yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D39899
  • Loading branch information
shubin-yb committed Nov 14, 2024
1 parent ad8384e commit 3b93e99
Showing 2 changed files with 15 additions and 12 deletions.
Original file line number Diff line number Diff line change
@@ -762,8 +762,9 @@ public void checkSingleUniverse(CheckSingleUniverseParams params) {
if (nodeInfo.enableYEDIS && nodeDetails.isRedisServer) {
nodeInfo.setRedisPort(nodeDetails.redisServerRpcPort);
}
if (!provider.getCode().equals(CloudType.onprem.toString())
&& !provider.getCode().equals(CloudType.kubernetes.toString())) {

// Skip clock check for k8s.
if (!provider.getCode().equals(CloudType.kubernetes.toString())) {
nodeInfo.setCheckClock(true);
}
// Clock drift config values. Clock drift health checks are only run for non-k8s universes
22 changes: 12 additions & 10 deletions managed/src/main/resources/health/node_health.py.template
Original file line number Diff line number Diff line change
@@ -1838,26 +1838,28 @@ class NodeChecker():
Metric.from_definition(YB_NODE_NTP_SERVICE_STATUS)
]
service_status = get_ntp_service_status()
service_error = service_status == 0
msgs = ["Ntp service is%s running" % " not" if service_error else ""]
metrics[1].add_value(service_status)
drift_ms = get_clock_drift_ms()
# Returns error string on failure, int on success
if isinstance(drift_ms, str):
return e.fill_and_return_entry([drift_ms], has_error=True, metrics=metrics)
msgs.append(drift_ms)
return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics)
metrics[0].add_value(drift_ms)
errors = []
if drift_ms > self.time_drift_err_threshold:
errors.append("Node clock drift is {} ms, over {} ms".format(
msgs.append("Node clock drift is {} ms, over {} ms".format(
drift_ms, self.time_drift_err_threshold))
return e.fill_and_return_entry(errors, has_error=True, metrics=metrics)
return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics)
if drift_ms > self.time_drift_wrn_threshold:
errors.append("Node clock drift is {} ms, over {} ms".format(
msgs.append("Node clock drift is {} ms, over {} ms".format(
drift_ms, self.time_drift_wrn_threshold))
return e.fill_and_return_warning_entry(errors, metrics=metrics)
if service_error:
return e.fill_and_return_entry(msgs, has_error=True, metrics=metrics)
return e.fill_and_return_warning_entry(msgs, metrics=metrics)

service_error = service_status == 0
return e.fill_and_return_entry(
["%s ms" % drift_ms, "ntp service is%s running" % " not" if service_error else ""],
has_error=service_error, metrics=metrics)
msgs.append("%s ms" % drift_ms)
return e.fill_and_return_entry(msgs, has_error=service_error, metrics=metrics)

def check_process_stats(self, process_name):
metrics = [Metric.from_definition(YB_PROCESS_CPU_SECONDS_TOTAL),

0 comments on commit 3b93e99

Please sign in to comment.