Skip to content

Commit

Permalink
[Resources] Add cpus in resource specification (skypilot-org#1622)
Browse files Browse the repository at this point in the history
  • Loading branch information
WoosukKwon authored and Sumanth committed Mar 15, 2023
1 parent 8172cb3 commit a50a357
Show file tree
Hide file tree
Showing 19 changed files with 510 additions and 144 deletions.
6 changes: 6 additions & 0 deletions docs/source/reference/yaml-spec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ Available fields:
# Format: <name>:<count> (or simply <name>, short for a count of 1).
accelerators: V100:4
# Number of vCPUs per node (optional).
#
# Format: <count> (exactly <count> vCPUs) or <count>+
# (at least <count> vCPUs).
cpus: 32
# Instance type to use (optional). If 'accelerators' is specified,
# the corresponding instance type is automatically inferred.
instance_type: p3.8xlarge
Expand Down
6 changes: 3 additions & 3 deletions examples/example_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def make_application():
sky.Resources(sky.AWS(), 'p3.2xlarge'), # 1 V100, EC2.
sky.Resources(sky.AWS(), 'p3.8xlarge'), # 4 V100s, EC2.
# Tuples mean all resources are required.
sky.Resources(sky.GCP(), 'n1-standard-8', 'tpu-v3-8'),
sky.Resources(sky.GCP(), 'n1-standard-8', accelerators='tpu-v3-8'),
})

train_op.set_time_estimator(time_estimators.resnet50_estimate_runtime)
Expand All @@ -60,8 +60,8 @@ def make_application():
infer_op.set_resources({
sky.Resources(sky.AWS(), 'inf1.2xlarge'),
sky.Resources(sky.AWS(), 'p3.2xlarge'),
sky.Resources(sky.GCP(), 'n1-standard-4', 'T4'),
sky.Resources(sky.GCP(), 'n1-standard-8', 'T4'),
sky.Resources(sky.GCP(), 'n1-standard-4', accelerators='T4'),
sky.Resources(sky.GCP(), 'n1-standard-8', accelerators='T4'),
})

infer_op.set_time_estimator(
Expand Down
77 changes: 59 additions & 18 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def _interactive_node_cli_command(cli_func):
default=None,
type=str,
help='Instance type to use.')
cpus = click.option(
'--cpus',
default=None,
type=str,
help=('Number of vCPUs each instance must have '
'(e.g., ``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
gpus = click.option('--gpus',
default=None,
type=str,
Expand Down Expand Up @@ -268,6 +275,7 @@ def _interactive_node_cli_command(cli_func):
region_option,
zone_option,
instance_type_option,
cpus,
*([gpus] if cli_func.__name__ == 'gpunode' else []),
*([tpus] if cli_func.__name__ == 'tpunode' else []),
spot_option,
Expand Down Expand Up @@ -556,6 +564,7 @@ def _parse_override_params(cloud: Optional[str] = None,
region: Optional[str] = None,
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
instance_type: Optional[str] = None,
use_spot: Optional[bool] = None,
image_id: Optional[str] = None,
Expand All @@ -582,6 +591,11 @@ def _parse_override_params(cloud: Optional[str] = None,
override_params['accelerators'] = None
else:
override_params['accelerators'] = gpus
if cpus is not None:
if cpus.lower() == 'none':
override_params['cpus'] = None
else:
override_params['cpus'] = cpus
if instance_type is not None:
if instance_type.lower() == 'none':
override_params['instance_type'] = None
Expand Down Expand Up @@ -908,6 +922,7 @@ def _make_task_from_entrypoint_with_overrides(
region: Optional[str] = None,
zone: Optional[str] = None,
gpus: Optional[str] = None,
cpus: Optional[str] = None,
instance_type: Optional[str] = None,
num_nodes: Optional[int] = None,
use_spot: Optional[bool] = None,
Expand Down Expand Up @@ -949,6 +964,7 @@ def _make_task_from_entrypoint_with_overrides(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -1090,6 +1106,13 @@ def cli():
default=False,
help='If used, runs locally inside a docker container.')
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
@click.option('--cpus',
default=None,
type=str,
required=False,
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option('--disk-size',
default=None,
type=int,
Expand Down Expand Up @@ -1154,6 +1177,7 @@ def launch(
region: Optional[str],
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -1198,6 +1222,7 @@ def launch(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down Expand Up @@ -1343,6 +1368,7 @@ def exec(
region=region,
zone=zone,
gpus=gpus,
cpus=None,
instance_type=instance_type,
use_spot=use_spot,
image_id=image_id,
Expand Down Expand Up @@ -2416,11 +2442,11 @@ def _down_or_stop(name: str):
# pylint: disable=redefined-outer-name
def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], gpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
gpus: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
"""Launch or attach to an interactive GPU node.
Examples:
Expand Down Expand Up @@ -2459,7 +2485,8 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
gpus is None and use_spot is None)
cpus is None and gpus is None and
use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['gpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if gpus is None and instance_type is None:
Expand All @@ -2472,6 +2499,7 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
accelerators=gpus,
use_spot=use_spot,
disk_size=disk_size)
Expand All @@ -2495,10 +2523,11 @@ def gpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
# pylint: disable=redefined-outer-name
def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
cloud: Optional[str], region: Optional[str], zone: Optional[str],
instance_type: Optional[str], use_spot: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
use_spot: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive CPU node.
Examples:
Expand Down Expand Up @@ -2536,7 +2565,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],

user_requested_resources = not (cloud is None and region is None and
zone is None and instance_type is None and
use_spot is None)
cpus is None and use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['cpunode']
cloud_provider = clouds.CLOUD_REGISTRY.from_str(cloud)
if instance_type is None:
Expand All @@ -2547,6 +2576,7 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
use_spot=use_spot,
disk_size=disk_size)

Expand All @@ -2569,11 +2599,12 @@ def cpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
# pylint: disable=redefined-outer-name
def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region: Optional[str], zone: Optional[str],
instance_type: Optional[str], tpus: Optional[str],
use_spot: Optional[bool], tpu_vm: Optional[bool],
screen: Optional[bool], tmux: Optional[bool],
disk_size: Optional[int], idle_minutes_to_autostop: Optional[int],
down: bool, retry_until_up: bool):
instance_type: Optional[str], cpus: Optional[str],
tpus: Optional[str], use_spot: Optional[bool],
tpu_vm: Optional[bool], screen: Optional[bool],
tmux: Optional[bool], disk_size: Optional[int],
idle_minutes_to_autostop: Optional[int], down: bool,
retry_until_up: bool):
"""Launch or attach to an interactive TPU node.
Examples:
Expand Down Expand Up @@ -2610,8 +2641,8 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
name = _default_interactive_node_name('tpunode')

user_requested_resources = not (region is None and zone is None and
instance_type is None and tpus is None and
use_spot is None)
instance_type is None and cpus is None and
tpus is None and use_spot is None)
default_resources = _INTERACTIVE_NODE_DEFAULT_RESOURCES['tpunode']
accelerator_args = default_resources.accelerator_args
if tpu_vm:
Expand All @@ -2627,6 +2658,7 @@ def tpunode(cluster: str, yes: bool, port_forward: Optional[List[int]],
region=region,
zone=zone,
instance_type=instance_type,
cpus=cpus,
accelerators=tpus,
accelerator_args=accelerator_args,
use_spot=use_spot,
Expand Down Expand Up @@ -2969,6 +3001,13 @@ def spot():
**_get_shell_complete_args(_complete_file_name))
# TODO(zhwu): Add --dryrun option to test the launch command.
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
@click.option('--cpus',
default=None,
type=str,
required=False,
help=('Number of vCPUs each instance must have (e.g., '
'``--cpus=4`` (exactly 4) or ``--cpus=4+`` (at least 4)). '
'This is used to automatically select the instance type.'))
@click.option('--spot-recovery',
default=None,
type=str,
Expand Down Expand Up @@ -3011,6 +3050,7 @@ def spot_launch(
region: Optional[str],
zone: Optional[str],
gpus: Optional[str],
cpus: Optional[str],
instance_type: Optional[str],
num_nodes: Optional[int],
use_spot: Optional[bool],
Expand Down Expand Up @@ -3049,6 +3089,7 @@ def spot_launch(
region=region,
zone=zone,
gpus=gpus,
cpus=cpus,
instance_type=instance_type,
num_nodes=num_nodes,
use_spot=use_spot,
Expand Down
23 changes: 14 additions & 9 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,10 +274,10 @@ def is_same_cloud(self, other: clouds.Cloud):
return isinstance(other, AWS)

@classmethod
def get_default_instance_type(cls) -> str:
# General-purpose instance with 8 vCPUs and 32 GB RAM.
# Intel Ice Lake 8375C
return 'm6i.2xlarge'
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
clouds='aws')

# TODO: factor the following three methods, as they are the same logic
# between Azure and AWS.
Expand Down Expand Up @@ -334,12 +334,11 @@ def make_deploy_resources_variables(

def get_feasible_launchable_resources(self,
resources: 'resources_lib.Resources'):
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], [])

def _make(instance_list):
resource_list = []
Expand All @@ -350,16 +349,21 @@ def _make(instance_list):
# Setting this to None as AWS doesn't separately bill /
# attach the accelerators. Billed as part of the VM type.
accelerators=None,
cpus=None,
)
resource_list.append(r)
return resource_list

# Currently, handle filters on accelerators and vCPU count (cpus) only.
accelerators = resources.accelerators
if accelerators is None:
# No requirements to filter, so just return a default VM type.
return (_make([AWS.get_default_instance_type()]),
fuzzy_candidate_list)
# Return a default instance type with the given number of vCPUs.
default_instance_type = AWS.get_default_instance_type(
cpus=resources.cpus)
if default_instance_type is None:
return ([], [])
else:
return (_make([default_instance_type]), [])

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -368,6 +372,7 @@ def _make(instance_list):
acc,
acc_count,
use_spot=resources.use_spot,
cpus=resources.cpus,
region=resources.region,
zone=resources.zone,
clouds='aws')
Expand Down
26 changes: 16 additions & 10 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,10 @@ def is_same_cloud(self, other):
return isinstance(other, Azure)

@classmethod
def get_default_instance_type(cls) -> str:
# General-purpose instance with 8 vCPUs and 32 GB RAM.
# Intel Ice Lake 8370C
return 'Standard_D8_v5'
def get_default_instance_type(cls,
cpus: Optional[str] = None) -> Optional[str]:
return service_catalog.get_default_instance_type(cpus=cpus,
clouds='azure')

def _get_image_config(self, gen_version, instance_type):
# az vm image list \
Expand Down Expand Up @@ -250,12 +250,11 @@ def get_feasible_launchable_resources(self, resources):
# TODO(zhwu): our azure subscription offer ID does not support spot.
# Need to support it.
return ([], [])
fuzzy_candidate_list = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Treat Resources(Azure, <instance_type>, <accelerators>) as
# Resources(Azure, <instance_type>): the accelerators are implied by the
# instance type.
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], [])

def _make(instance_list):
resource_list = []
Expand All @@ -265,23 +264,30 @@ def _make(instance_list):
instance_type=instance_type,
# Setting this to None as Azure doesn't separately bill /
# attach the accelerators. Billed as part of the VM type.
accelerators=None)
accelerators=None,
cpus=None,
)
resource_list.append(r)
return resource_list

# Currently, handle filters on accelerators and vCPU count (cpus) only.
accelerators = resources.accelerators
if accelerators is None:
# No requirements to filter, so just return a default VM type.
return (_make([Azure.get_default_instance_type()]),
fuzzy_candidate_list)
# Return a default instance type with the given number of vCPUs.
default_instance_type = Azure.get_default_instance_type(
cpus=resources.cpus)
if default_instance_type is None:
return ([], [])
else:
return (_make([default_instance_type]), [])

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
(instance_list, fuzzy_candidate_list
) = service_catalog.get_instance_type_for_accelerator(
acc,
acc_count,
cpus=resources.cpus,
use_spot=resources.use_spot,
region=resources.region,
zone=resources.zone,
Expand Down
Loading

0 comments on commit a50a357

Please sign in to comment.