From 3170fa4992b06436a9dc32e2247a194fa0ec4541 Mon Sep 17 00:00:00 2001
From: Aleksandr Malyshev
Date: Fri, 22 Apr 2022 17:33:24 +0300
Subject: [PATCH] [PLAT-3761] Completely remove pipes from all health check
 remote commands

Summary: It turns out that shell pipes cause timeouts when the commands are
executed remotely over ssh. We were not able to find the actual cause (it is
not reproducible locally or on any of our portals), so pipes are removed from
the health check scripts entirely.

Test Plan: Check that health checks work as expected and produce reasonable
metric values in the output.

Reviewers: spotachev, nsingh

Reviewed By: nsingh

Subscribers: jenkins-bot, yugaware

Differential Revision: https://phabricator.dev.yugabyte.com/D16632
---
 managed/devops/bin/cluster_health.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/managed/devops/bin/cluster_health.py b/managed/devops/bin/cluster_health.py
index 55d4031dda8..51c0c248817 100755
--- a/managed/devops/bin/cluster_health.py
+++ b/managed/devops/bin/cluster_health.py
@@ -520,7 +520,7 @@ def check_tserver_yb_version(self, process):
             self.node, process, self.tserver_http_port, self.universe_version)
 
     def check_logs_find_output(self, output):
-        logs = []
+        log_tuples = []
         if output:
             for line in output.strip().split('\n'):
                 splits = line.strip().split()
@@ -533,10 +533,11 @@ def check_logs_find_output(self, output):
                 epoch = epoch.split('.')[0]
                 if not epoch.isdigit():
                     continue
-                logs.append('{} ({} old)'.format(
-                    filename,
-                    ''.join(seconds_to_human_readable_time(int(time.time() - int(epoch))))))
-        return logs
+                log_tuples.append((epoch, filename,
+                    seconds_to_human_readable_time(int(time.time() - int(epoch)))))
+
+        sorted_logs = sorted(log_tuples, key=lambda log: log[0], reverse=True)
+        return list(map(lambda log: '{} ({} old)'.format(log[1], log[2]), sorted_logs))
 
     def check_for_error_logs(self, process):
         logging.info("Checking for error logs on node {}".format(self.node))
@@ -547,7 +548,7 @@
         metric_value = 0
 
         for log_severity in ["FATAL", "ERROR"]:
-            remote_cmd = ('find {} {} -name "*{}*" -type f -printf "%T@ %p\\n" | sort -rn'.format(
+            remote_cmd = ('find {} {} -name "*{}*" -type f -printf "%T@ %p\\n"'.format(
                 search_dir,
                 '-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES),
                 log_severity))
@@ -636,13 +637,13 @@ def check_file_descriptors(self):
         logging.info("Checking for open file descriptors on node {}".format(self.node))
         e = self._new_entry("Opened file descriptors")
-        remote_cmd = 'ulimit -n; cat /proc/sys/fs/file-max; cat /proc/sys/fs/file-nr | cut -f1'
+        remote_cmd = 'ulimit -n; cat /proc/sys/fs/file-max; awk \'{print $1}\' /proc/sys/fs/file-nr'
         output = self._remote_check_output(remote_cmd)
         if has_errors(output):
             return e.fill_and_return_entry([output], True)
-        counts = output.split()
+        counts = output.split('\n')
         if len(counts) != 3:
             return e.fill_and_return_entry(
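
Note (illustrative sketch, not part of the patch): the pattern adopted here is to run a
single pipe-free command on the remote host and do any ordering or field extraction
locally. The sketch below parses output of the form produced by find -printf "%T@ %p\n"
and sorts it newest-first in Python rather than piping through sort -rn over ssh, in the
spirit of what check_logs_find_output now does. The names human_readable_age and
newest_first_log_lines, as well as the sample paths, are hypothetical stand-ins and not
helpers from cluster_health.py.

    import time

    def human_readable_age(seconds):
        # Hypothetical stand-in for the script's seconds_to_human_readable_time helper.
        minutes, secs = divmod(int(seconds), 60)
        hours, minutes = divmod(minutes, 60)
        return '{}h {}m {}s'.format(hours, minutes, secs)

    def newest_first_log_lines(find_output):
        # Parse lines of the form "<mtime epoch> <path>", as produced by
        # find -printf "%T@ %p\n", and order them newest-first locally.
        log_tuples = []
        for line in find_output.strip().split('\n'):
            splits = line.strip().split()
            if len(splits) != 2:
                continue
            epoch, filename = splits
            epoch = epoch.split('.')[0]
            if not epoch.isdigit():
                continue
            age = human_readable_age(time.time() - int(epoch))
            log_tuples.append((int(epoch), filename, age))
        # Sorting here replaces the `| sort -rn` that used to run on the remote host.
        log_tuples.sort(key=lambda log: log[0], reverse=True)
        return ['{} ({} old)'.format(name, age) for _, name, age in log_tuples]

    # Canned output from a hypothetical remote `find` invocation.
    sample = ('1650614400.123 /tmp/example/yb-tserver.FATAL.log\n'
              '1650610800.456 /tmp/example/yb-tserver.ERROR.log')
    print('\n'.join(newest_first_log_lines(sample)))

The file descriptor check follows the same idea: awk '{print $1}' /proc/sys/fs/file-nr
reads the file directly, so the remote command stays a semicolon-separated sequence
with no pipe, and the script splits the three resulting lines locally with
output.split('\n').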