Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Autoscaler] CLI Logger docs #9690

Merged
merged 16 commits into from
Jul 24, 2020
Prev Previous commit
Next Next commit
Update ray down as well
  • Loading branch information
maximsmol committed Jul 6, 2020
commit c01199dd1d273d7f2dab7986c954e693ac7ed9e5
30 changes: 23 additions & 7 deletions python/ray/autoscaler/aws/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def terminate_node(self, node_id):
if self.cache_stopped_nodes:
if node.spot_instance_request_id:
cli_logger.print(
"Terminating node {} " +
"Terminating instance {} " +
cf.gray("(cannot stop spot instances, only terminate)"),
node_id) # todo: show node name?

Expand All @@ -388,11 +388,11 @@ def terminate_node(self, node_id):
node.terminate()
else:
cli_logger.print(
"Stopping node {} " +
"Stopping instance {} " +
cf.gray(
"(to terminate instead, "
"set `cache_stopped_nodes: False` "
"under `provider` in the cluster configuration )"),
"under `provider` in the cluster configuration)"),
node_id) # todo: show node name?

cli_logger.old_info(
Expand Down Expand Up @@ -421,15 +421,31 @@ def terminate_nodes(self, node_ids):
on_demand_ids += [node_id]

if on_demand_ids:
logger.info(
# todo: show node names?
cli_logger.print(
"Stopping instances {} " +
cf.gray(
"(to terminate instead, "
"set `cache_stopped_nodes: False` "
"under `provider` in the cluster configuration)"),
cli_logger.render_list(on_demand_ids))
cli_logger.old_info(
logger,
"AWSNodeProvider: stopping nodes {}. To terminate nodes "
"on stop, set 'cache_stopped_nodes: False' in the "
"provider config.".format(on_demand_ids))
"provider config.", on_demand_ids)

self.ec2.meta.client.stop_instances(InstanceIds=on_demand_ids)
if spot_ids:
logger.info(
cli_logger.print(
"Terminating instances {} " +
cf.gray("(cannot stop spot instances, only terminate)"),
cli_logger.render_list(spot_ids))
cli_logger.old_info(
logger,
"AWSNodeProvider: terminating nodes {} (spot nodes cannot "
"be stopped, only terminated)".format(spot_ids))
"be stopped, only terminated)", spot_ids)

self.ec2.meta.client.terminate_instances(InstanceIds=spot_ids)
else:
self.ec2.meta.client.terminate_instances(InstanceIds=node_ids)
Expand Down
38 changes: 23 additions & 15 deletions python/ray/autoscaler/cli_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def _format_msg(msg, *args, **kwargs):
class _CliLogger():
def __init__(self):
self.strip = False
self.old_style = False
self.old_style = True
self.color_mode = "auto"
self.indent_level = 0
self.verbosity = 0
Expand All @@ -84,9 +84,11 @@ def detect_colors(self):
if self.color_mode == "false":
self.strip = True
return
if self.color_mode == "auto":
self.strip = sys.stdout.isatty()
return

# todo: actually detect ttys here
self.strip = False
raise ValueError("Invalid log color setting: " + self.color_mode)

def newline(self):
self._print('')
Expand Down Expand Up @@ -212,6 +214,11 @@ def old_error(self, logger, msg, *args, **kwargs):
logger.error(_format_msg(msg, *args, **kwargs))
return

def old_exception(self, logger, msg, *args, **kwargs):
    """Emit an exception record through the legacy ``logging`` logger.

    Mirrors ``old_error``/``old_info``: it is a no-op unless old-style
    logging is enabled, in which case the message is formatted with
    ``_format_msg`` and passed to ``logger.exception`` (which attaches
    the active exception's traceback).
    """
    if not self.old_style:
        return
    logger.exception(_format_msg(msg, *args, **kwargs))

def add_log_info(self, **kwargs):
for k, v in kwargs.items():
self.info[k] = v
Expand Down Expand Up @@ -263,27 +270,28 @@ def confirm(self, yes, msg, *args, **kwargs):
try:
while True:
ans = sys.stdin.readline()
ans = ans.strip()
ans = ans.lower()

if ans == "\n":
res = default
break
elif ans in yes_answers:

ans = ans.strip()
if ans in yes_answers:
res = True
break
elif ans in no_answers:
if ans in no_answers:
res = False
break
else:
indent = " " * l
self.error(
"{}Invalid answer: {}. "
"Expected {} or {}",
indent, cf.bold(ans.strip()),
self.render_list(yes_answers, "/"),
self.render_list(no_answers, "/"))
self._print(indent + confirm_str, linefeed=False)

indent = " " * l
self.error(
"{}Invalid answer: {}. "
"Expected {} or {}",
indent, cf.bold(ans.strip()),
self.render_list(yes_answers, "/"),
self.render_list(no_answers, "/"))
self._print(indent + confirm_str, linefeed=False)
except KeyboardInterrupt:
self.newline()
res = default
Expand Down
53 changes: 45 additions & 8 deletions python/ray/autoscaler/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,8 +351,12 @@ def _bootstrap_config(config):


def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
keep_min_workers):
keep_min_workers, log_old_style, log_color, verbose):
"""Destroys all nodes of a Ray cluster described by a config json."""
cli_logger.old_style = log_old_style
cli_logger.color_mode = log_color
cli_logger.verbosity = verbose
cli_logger.dump_command_output = verbose == 3 # todo: add a separate flag?

config = yaml.safe_load(open(config_file).read())
if override_cluster_name is not None:
Expand All @@ -369,8 +373,16 @@ def teardown_cluster(config_file, yes, workers_only, override_cluster_name,
try:
exec_cluster(config_file, "ray stop", False, False, False, False,
False, override_cluster_name, None, False)
except Exception:
logger.exception("Ignoring error attempting a clean shutdown.")
except Exception as e:
cli_logger.warning( # todo: add exception info
"Exception occured when stopping the cluster Ray runtime.")
cli_logger.warning(
"Ignoring the exception and "
"attempting to shut down the cluster nodes anyway.")

cli_logger.old_exception(
logger,
"Ignoring error attempting a clean shutdown.")

provider = get_node_provider(config["provider"], config["cluster_name"])
try:
Expand All @@ -383,11 +395,24 @@ def remaining_nodes():

if keep_min_workers:
min_workers = config.get("min_workers", 0)
logger.info("teardown_cluster: "
"Keeping {} nodes...".format(min_workers))

cli_logger.print(
"{} random worker nodes will not be shut down. " +
cf.gray("(due to {})"),
cf.bold(min_workers), cf.bold("--keep-min-workers"))
cli_logger.old_info(
logger,
"teardown_cluster: Keeping {} nodes...", min_workers)

workers = random.sample(workers, len(workers) - min_workers)

# todo: it's weird to kill the head node but not all workers
if workers_only:
cli_logger.print(
"The head node will not be shut down. " +
cf.gray("(due to {})"),
cf.bold("--workers-only"))

return workers

head = provider.non_terminated_nodes({
Expand All @@ -401,11 +426,23 @@ def remaining_nodes():
A = remaining_nodes()
with LogTimer("teardown_cluster: done."):
while A:
logger.info("teardown_cluster: "
"Shutting down {} nodes...".format(len(A)))
cli_logger.old_info(
logger,
"teardown_cluster: "
"Shutting down {} nodes...", len(A))

provider.terminate_nodes(A)
time.sleep(1)

cli_logger.print(
"Requested {} nodes to shut down.",
cf.bold(len(A)),
_tags=dict(interval="1s"))

time.sleep(1) # todo: interval should be a variable
A = remaining_nodes()
cli_logger.print(
"{} nodes remaining after 1 second.",
cf.bold(len(A)))
finally:
provider.cleanup()

Expand Down
20 changes: 18 additions & 2 deletions python/ray/scripts/scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,20 @@ def create_or_update(cluster_config_file, min_workers, max_workers, no_restart,

@cli.command(hidden=True)
@click.argument("cluster_config_file", required=True, type=str)
@click.option("-v", "--verbose", count=True)
@click.option(
"--log-color",
required=False,
type=str,
default="auto",
help=(
"Use color logging. "
"Valid values are: auto (if stdout is a tty), true, false."))
@click.option(
"--log-old-style",
is_flag=True,
default=False,
help=("Use old logging."))
@click.option(
"--workers-only",
is_flag=True,
Expand All @@ -718,10 +732,12 @@ def create_or_update(cluster_config_file, min_workers, max_workers, no_restart,
type=str,
help="Override the configured cluster name.")
def teardown(cluster_config_file, yes, workers_only, cluster_name,
             keep_min_workers,
             log_old_style, log_color, verbose):
    """Tear down a Ray cluster.

    Thin Click entry point: forwards every CLI option unchanged to
    ``teardown_cluster``, including the logging controls declared on this
    command (``--log-old-style``, ``--log-color``, and the ``-v`` count),
    which ``teardown_cluster`` uses to configure the CLI logger.
    """
    teardown_cluster(cluster_config_file, yes, workers_only, cluster_name,
                     keep_min_workers,
                     log_old_style, log_color, verbose)


@cli.command()
Expand Down