diff --git a/sky/cli.py b/sky/cli.py index 3be2ae374f5..6bc4075c260 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -207,15 +207,53 @@ def _interactive_node_cli_command(cli_func): default=False, required=False, help='Skip confirmation prompt.') + idle_autostop = click.option('--idle-minutes-to-autostop', + '-i', + default=None, + type=int, + required=False, + help=('Automatically stop the cluster after ' + 'this many minutes of idleness, i.e. ' + 'no running or pending jobs in the ' + 'cluster\'s job queue. Idleness starts ' + 'counting after setup/file_mounts are ' + 'done; the clock gets reset whenever ' + 'there are running/pending jobs in the ' + 'job queue. If not set, the cluster ' + 'will not be auto-stopped.')) + retry_until_up = click.option('--retry-until-up', + '-r', + is_flag=True, + default=False, + required=False, + help=('Whether to retry provisioning ' + 'infinitely until the cluster is up ' + 'if we fail to launch the cluster on ' + 'any possible region/cloud due to ' + 'unavailability errors.')) + region_option = click.option('--region', + default=None, + type=str, + required=False, + help='The region to use.') + zone_option = click.option('--zone', + default=None, + type=str, + required=False, + help='The zone to use.') click_decorators = [ cli.command(cls=_DocumentedCodeCommand), cluster_option, no_confirm, port_forward_option, + idle_autostop, + retry_until_up, # Resource options *([cloud_option] if cli_func.__name__ != 'tpunode' else []), + region_option, + zone_option, instance_type_option, *([gpus] if cli_func.__name__ == 'gpunode' else []), *([tpus] if cli_func.__name__ == 'tpunode' else []), @@ -669,6 +707,8 @@ def _create_and_ssh_into_node( session_manager: Optional[str] = None, user_requested_resources: Optional[bool] = False, no_confirm: bool = False, + idle_minutes_to_autostop: Optional[int] = None, + retry_until_up: bool = False, ): """Creates and attaches to an interactive node. @@ -681,6 +721,14 @@ def _create_and_ssh_into_node( session_manager: Attach session manager: { 'screen', 'tmux' }. user_requested_resources: If true, user requested resources explicitly. no_confirm: If true, skips confirmation prompt presented to user. + idle_minutes_to_autostop: Automatically stop the cluster after + specified minutes of idleness. Idleness + starts counting after setup/file_mounts are + done; the clock gets reset whenever there + are running/pending jobs in the job queue. + retry_until_up: Whether to retry provisioning infinitely until the + cluster is up if we fail to launch due to + unavailability errors. """ assert node_type in _INTERACTIVE_NODE_TYPES, node_type assert session_manager in (None, 'screen', 'tmux'), session_manager @@ -719,6 +767,8 @@ def _create_and_ssh_into_node( dryrun=False, detach_run=True, no_confirm=no_confirm, + idle_minutes_to_autostop=idle_minutes_to_autostop, + retry_until_up=retry_until_up, node_type=node_type, ) handle = global_user_state.get_handle_from_cluster_name(cluster_name) @@ -1957,10 +2007,11 @@ def _terminate_or_stop(name: str): @usage_lib.entrypoint # pylint: disable=redefined-outer-name def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], - cloud: Optional[str], instance_type: Optional[str], - gpus: Optional[str], use_spot: Optional[bool], - screen: Optional[bool], tmux: Optional[bool], - disk_size: Optional[int]): + cloud: Optional[str], region: Optional[str], zone: Optional[str], + instance_type: Optional[str], gpus: Optional[str], + use_spot: Optional[bool], screen: Optional[bool], + tmux: Optional[bool], disk_size: Optional[int], + idle_minutes_to_autostop: Optional[int], retry_until_up: bool): """Launch or attach to an interactive GPU node. Examples: @@ -2008,6 +2059,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], if use_spot is None: use_spot = default_resources.use_spot resources = sky.Resources(cloud=cloud_provider, + region=region, + zone=zone, instance_type=instance_type, accelerators=gpus, use_spot=use_spot, @@ -2021,6 +2074,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], session_manager=session_manager, user_requested_resources=user_requested_resources, no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + retry_until_up=retry_until_up, ) @@ -2028,9 +2083,11 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], @usage_lib.entrypoint # pylint: disable=redefined-outer-name def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], - cloud: Optional[str], instance_type: Optional[str], - use_spot: Optional[bool], screen: Optional[bool], - tmux: Optional[bool], disk_size: Optional[int]): + cloud: Optional[str], region: Optional[str], zone: Optional[str], + instance_type: Optional[str], use_spot: Optional[bool], + screen: Optional[bool], tmux: Optional[bool], + disk_size: Optional[int], idle_minutes_to_autostop: Optional[int], + retry_until_up: bool): """Launch or attach to an interactive CPU node. Examples: @@ -2075,6 +2132,8 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], if use_spot is None: use_spot = default_resources.use_spot resources = sky.Resources(cloud=cloud_provider, + region=region, + zone=zone, instance_type=instance_type, use_spot=use_spot, disk_size=disk_size) @@ -2087,6 +2146,8 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], session_manager=session_manager, user_requested_resources=user_requested_resources, no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + retry_until_up=retry_until_up, ) @@ -2094,10 +2155,12 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], @usage_lib.entrypoint # pylint: disable=redefined-outer-name def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], + region: Optional[str], zone: Optional[str], instance_type: Optional[str], tpus: Optional[str], use_spot: Optional[bool], tpu_vm: Optional[bool], screen: Optional[bool], tmux: Optional[bool], - disk_size: Optional[int]): + disk_size: Optional[int], idle_minutes_to_autostop: Optional[int], + retry_until_up: bool): """Launch or attach to an interactive TPU node. Examples: @@ -2147,6 +2210,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], if use_spot is None: use_spot = default_resources.use_spot resources = sky.Resources(cloud=sky.GCP(), + region=region, + zone=zone, instance_type=instance_type, accelerators=tpus, accelerator_args=accelerator_args, @@ -2161,6 +2226,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]], session_manager=session_manager, user_requested_resources=user_requested_resources, no_confirm=yes, + idle_minutes_to_autostop=idle_minutes_to_autostop, + retry_until_up=retry_until_up, )