Skip to content

Commit

Permalink
[PLAT-3761] Completely remove pipes from all health checks remote com…
Browse files Browse the repository at this point in the history
…mands

Summary:
It turns out that shell pipes cause timeouts when they are executed remotely over ssh.
I was not able to find the actual root cause (the issue is not reproducible locally or on any of our portals), hence pipes are removed completely from the health check scripts.

Test Plan: Check that health checks are working as expected and provide reasonable metric values in output.

Reviewers: spotachev, nsingh

Reviewed By: nsingh

Subscribers: jenkins-bot, yugaware

Differential Revision: https://phabricator.dev.yugabyte.com/D16632
  • Loading branch information
anmalysh-yb committed Apr 22, 2022
1 parent 5ce1473 commit 3170fa4
Showing 1 changed file with 9 additions and 8 deletions.
17 changes: 9 additions & 8 deletions managed/devops/bin/cluster_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ def check_tserver_yb_version(self, process):
self.node, process, self.tserver_http_port, self.universe_version)

def check_logs_find_output(self, output):
logs = []
log_tuples = []
if output:
for line in output.strip().split('\n'):
splits = line.strip().split()
Expand All @@ -533,10 +533,11 @@ def check_logs_find_output(self, output):
epoch = epoch.split('.')[0]
if not epoch.isdigit():
continue
logs.append('{} ({} old)'.format(
filename,
''.join(seconds_to_human_readable_time(int(time.time() - int(epoch))))))
return logs
log_tuples.append((epoch, filename,
seconds_to_human_readable_time(int(time.time() - int(epoch)))))

sorted_logs = sorted(log_tuples, key=lambda log: log[0], reverse=True)
return list(map(lambda log: '{} ({} old)'.format(log[1], log[2]), sorted_logs))

def check_for_error_logs(self, process):
logging.info("Checking for error logs on node {}".format(self.node))
Expand All @@ -547,7 +548,7 @@ def check_for_error_logs(self, process):

metric_value = 0
for log_severity in ["FATAL", "ERROR"]:
remote_cmd = ('find {} {} -name "*{}*" -type f -printf "%T@ %p\\n" | sort -rn'.format(
remote_cmd = ('find {} {} -name "*{}*" -type f -printf "%T@ %p\\n"'.format(
search_dir,
'-mmin -{}'.format(FATAL_TIME_THRESHOLD_MINUTES),
log_severity))
Expand Down Expand Up @@ -636,13 +637,13 @@ def check_file_descriptors(self):
logging.info("Checking for open file descriptors on node {}".format(self.node))
e = self._new_entry("Opened file descriptors")

remote_cmd = 'ulimit -n; cat /proc/sys/fs/file-max; cat /proc/sys/fs/file-nr | cut -f1'
remote_cmd = 'ulimit -n; cat /proc/sys/fs/file-max; awk \'{print $1}\' /proc/sys/fs/file-nr'
output = self._remote_check_output(remote_cmd)

if has_errors(output):
return e.fill_and_return_entry([output], True)

counts = output.split()
counts = output.split('\n')

if len(counts) != 3:
return e.fill_and_return_entry(
Expand Down

0 comments on commit 3170fa4

Please sign in to comment.