Skip to content

Commit

Permalink
Add --retry-until-up, --region, --zone, and `--idle-minutes-to-autostop` for interactive nodes (#1207)
Browse files Browse the repository at this point in the history

* Add --retry-until-up flag for interactive nodes

* Add --region flag for interactive nodes

* Add --idle-minutes-to-autostop flag for interactive nodes

* Add --zone flag for interactive nodes

* Update help messages

* Address nit
  • Loading branch information
ewzeng authored Oct 10, 2022
1 parent bd4f929 commit f06416d
Showing 1 changed file with 75 additions and 8 deletions.
83 changes: 75 additions & 8 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,53 @@ def _interactive_node_cli_command(cli_func):
default=False,
required=False,
help='Skip confirmation prompt.')
idle_autostop = click.option('--idle-minutes-to-autostop',
'-i',
default=None,
type=int,
required=False,
help=('Automatically stop the cluster after '
'this many minutes of idleness, i.e. '
'no running or pending jobs in the '
'cluster\'s job queue. Idleness starts '
'counting after setup/file_mounts are '
'done; the clock gets reset whenever '
'there are running/pending jobs in the '
'job queue. If not set, the cluster '
'will not be auto-stopped.'))
retry_until_up = click.option('--retry-until-up',
'-r',
is_flag=True,
default=False,
required=False,
help=('Whether to retry provisioning '
'infinitely until the cluster is up '
'if we fail to launch the cluster on '
'any possible region/cloud due to '
'unavailability errors.'))
region_option = click.option('--region',
default=None,
type=str,
required=False,
help='The region to use.')
zone_option = click.option('--zone',
default=None,
type=str,
required=False,
help='The zone to use.')

click_decorators = [
cli.command(cls=_DocumentedCodeCommand),
cluster_option,
no_confirm,
port_forward_option,
idle_autostop,
retry_until_up,

# Resource options
*([cloud_option] if cli_func.__name__ != 'tpunode' else []),
region_option,
zone_option,
instance_type_option,
*([gpus] if cli_func.__name__ == 'gpunode' else []),
*([tpus] if cli_func.__name__ == 'tpunode' else []),
Expand Down Expand Up @@ -669,6 +707,8 @@ def _create_and_ssh_into_node(
session_manager: Optional[str] = None,
user_requested_resources: Optional[bool] = False,
no_confirm: bool = False,
idle_minutes_to_autostop: Optional[int] = None,
retry_until_up: bool = False,
):
"""Creates and attaches to an interactive node.
Expand All @@ -681,6 +721,14 @@ def _create_and_ssh_into_node(
session_manager: Attach session manager: { 'screen', 'tmux' }.
user_requested_resources: If true, user requested resources explicitly.
no_confirm: If true, skips confirmation prompt presented to user.
idle_minutes_to_autostop: Automatically stop the cluster after
specified minutes of idleness. Idleness
starts counting after setup/file_mounts are
done; the clock gets reset whenever there
are running/pending jobs in the job queue.
retry_until_up: Whether to retry provisioning infinitely until the
cluster is up if we fail to launch due to
unavailability errors.
"""
assert node_type in _INTERACTIVE_NODE_TYPES, node_type
assert session_manager in (None, 'screen', 'tmux'), session_manager
Expand Down Expand Up @@ -719,6 +767,8 @@ def _create_and_ssh_into_node(
dryrun=False,
detach_run=True,
no_confirm=no_confirm,
idle_minutes_to_autostop=idle_minutes_to_autostop,
retry_until_up=retry_until_up,
node_type=node_type,
)
handle = global_user_state.get_handle_from_cluster_name(cluster_name)
Expand Down Expand Up @@ -1957,10 +2007,11 @@ def _terminate_or_stop(name: str):
@usage_lib.entrypoint
# pylint: disable=redefined-outer-name
def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], instance_type: Optional[str],
gpus: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int]):
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], gpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], retry_until_up: bool):
"""Launch or attach to an interactive GPU node.
Examples:
Expand Down Expand Up @@ -2008,6 +2059,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
if use_spot is None:
use_spot = default_resources.use_spot
resources = sky.Resources(cloud=cloud_provider,
region=region,
zone=zone,
instance_type=instance_type,
accelerators=gpus,
use_spot=use_spot,
Expand All @@ -2021,16 +2074,20 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
session_manager=session_manager,
user_requested_resources=user_requested_resources,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes_to_autostop,
retry_until_up=retry_until_up,
)


@_interactive_node_cli_command
@usage_lib.entrypoint
# pylint: disable=redefined-outer-name
def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], instance_type: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int]):
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
retry_until_up: bool):
"""Launch or attach to an interactive CPU node.
Examples:
Expand Down Expand Up @@ -2075,6 +2132,8 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
if use_spot is None:
use_spot = default_resources.use_spot
resources = sky.Resources(cloud=cloud_provider,
region=region,
zone=zone,
instance_type=instance_type,
use_spot=use_spot,
disk_size=disk_size)
Expand All @@ -2087,17 +2146,21 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
session_manager=session_manager,
user_requested_resources=user_requested_resources,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes_to_autostop,
retry_until_up=retry_until_up,
)


@_interactive_node_cli_command
@usage_lib.entrypoint
# pylint: disable=redefined-outer-name
def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region: Optional[str], zone: Optional[str],
instance_type: Optional[str], tpus: Optional[str],
use_spot: Optional[bool], tpu_vm: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int]):
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
retry_until_up: bool):
"""Launch or attach to an interactive TPU node.
Examples:
Expand Down Expand Up @@ -2147,6 +2210,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
if use_spot is None:
use_spot = default_resources.use_spot
resources = sky.Resources(cloud=sky.GCP(),
region=region,
zone=zone,
instance_type=instance_type,
accelerators=tpus,
accelerator_args=accelerator_args,
Expand All @@ -2161,6 +2226,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
session_manager=session_manager,
user_requested_resources=user_requested_resources,
no_confirm=yes,
idle_minutes_to_autostop=idle_minutes_to_autostop,
retry_until_up=retry_until_up,
)


Expand Down

0 comments on commit f06416d

Please sign in to comment.