
[Backward Compatibility][Spot] Avoid cluster leakage by ray yaml overwritten and reduce spot controller cost on AWS #1235

Merged: 10 commits, Oct 14, 2022
76 changes: 71 additions & 5 deletions sky/backends/backend_utils.py
@@ -31,6 +31,7 @@
from ray.autoscaler._private import util as ray_util
import rich.console as rich_console
import rich.progress as rich_progress
import yaml

import sky
from sky import authentication as auth
@@ -109,12 +110,33 @@
# Remote dir that holds our runtime files.
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'

# Include the fields that are used to generate the tags that distinguish the
# cluster in Ray, to avoid a stopped cluster being discarded due to updates in
# the yaml template.
# Some notes on the fields:
# - The 'provider' fields are used for bootstrapping and insert new items
#   into 'node_config'.
# - Keeping 'auth' alone is not enough, because the content of the key file is
#   also used for calculating the hash.
# TODO(zhwu): Keep in sync with the fields used in https://github.com/ray-project/ray/blob/e4ce38d001dbbe09cd21c497fedd03d692b2be3e/python/ray/autoscaler/_private/commands.py#L687-L701
_RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY = {

Member commented:
Is it true that none of the following contribute to the launch hash?

  • resources under ray.head.default:
  • any of
cluster_name: {{cluster_name}}

# The maximum number of workers nodes to launch in addition to the head node.
max_workers: {{num_nodes - 1}}
upscaling_speed: {{num_nodes - 1}}
idle_timeout_minutes: 60
  • head_node_type: ray.head.default

Just figuring out whether we should preserve these fields.

Collaborator Author commented:
I just added cluster_name to the key set. For resources and head_node_type, I think it would be better to add them when we actually want to modify those fields and find that they affect backward compatibility in the future.
As far as I understand, max_workers, upscaling_speed, and idle_timeout_minutes won't affect the launch hash.

'provider', 'auth', 'node_config'
}
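
For intuition, here is a simplified, self-contained sketch (not Ray's actual implementation; _example_launch_hash is hypothetical) of how a launch hash can depend on the node config and on the contents of the auth key files, which is why both must be kept stable across re-launches:

import hashlib
import json
import os


def _example_launch_hash(node_config: dict, auth: dict) -> str:
    # Replace key file paths with the file contents, so that rotating the
    # key itself (not just changing its path) also changes the hash.
    full_auth = dict(auth)
    for key_type in ('ssh_private_key', 'ssh_public_key'):
        key_path = os.path.expanduser(full_auth.get(key_type, ''))
        if key_path and os.path.exists(key_path):
            with open(key_path) as f:
                full_auth[key_type] = f.read()
    # Hash the node config and auth deterministically.
    hasher = hashlib.sha1()
    hasher.update(
        json.dumps([node_config, full_auth], sort_keys=True).encode('utf-8'))
    return hasher.hexdigest()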


def is_ip(s: str) -> bool:
"""Returns whether this string matches IP_ADDR_REGEX."""
return len(re.findall(IP_ADDR_REGEX, s)) == 1


def _get_yaml_path_from_cluster_name(cluster_name: str,
prefix: str = SKY_USER_FILE_PATH) -> str:
output_path = pathlib.Path(
prefix).expanduser().resolve() / f'{cluster_name}.yml'
os.makedirs(output_path.parents[0], exist_ok=True)
return str(output_path)
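
Illustrative usage only (the cluster name is hypothetical; with the default SKY_USER_FILE_PATH prefix the path lands under ~/.sky/generated/):

# Resolves to something like ~/.sky/generated/my-cluster.yml and creates
# the parent directory if it does not exist yet.
yaml_path = _get_yaml_path_from_cluster_name('my-cluster')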


def fill_template(template_name: str,
variables: Dict,
output_path: Optional[str] = None,
@@ -129,10 +151,8 @@ def fill_template(template_name: str,
if output_path is None:
assert ('cluster_name' in variables), ('cluster_name is required.')
cluster_name = variables.get('cluster_name')
output_path = pathlib.Path(
output_prefix).expanduser() / f'{cluster_name}.yml'
os.makedirs(output_path.parents[0], exist_ok=True)
output_path = str(output_path)
output_path = _get_yaml_path_from_cluster_name(cluster_name,
output_prefix)
output_path = os.path.abspath(output_path)

# Add yaml file path to the template variables.
@@ -655,6 +675,32 @@ def _remove_multinode_config(
break


def _restore_original_block_for_yaml(new_yaml: str, old_yaml: str,
key_names: Set[str]) -> str:
"""Replaces 'new' with 'old' for all keys in key_names.

The replacement will be applied recursively and only for the blocks
with the key in key_names, and have the same ancestors in both 'new'
and 'old' YAML tree.
"""

def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]):
# Iterate over a copy of the items, since keys may be deleted below.
for key, value in list(new_block.items()):
if key in key_names:
if key in old_block:
new_block[key] = old_block[key]
else:
del new_block[key]
elif isinstance(value, dict):
if key in old_block:
_restore_block(value, old_block[key])

new_config = yaml.safe_load(new_yaml)
old_config = yaml.safe_load(old_yaml)
_restore_block(new_config, old_config)
return common_utils.dump_yaml_str(new_config)
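
A minimal usage sketch (hypothetical values): restore the 'auth' block from a previously written config so that the hash-relevant fields are unchanged, while everything else keeps its new value.

old_yaml = """
auth:
  ssh_user: ubuntu
  ssh_private_key: ~/.ssh/old_key
"""
new_yaml = """
auth:
  ssh_user: ubuntu
  ssh_private_key: ~/.ssh/new_key
setup_commands:
  - echo hello
"""
# The result is new_yaml with the 'auth' block taken from old_yaml;
# 'setup_commands' keeps its new value.
restored = _restore_original_block_for_yaml(new_yaml, old_yaml, {'auth'})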


# TODO: too many things happening here - leaky abstraction. Refactor.
@timeline.event
def write_cluster_config(to_provision: 'resources.Resources',
@@ -666,7 +712,8 @@ def write_cluster_config(to_provision: 'resources.Resources',
region: Optional[clouds.Region] = None,
zones: Optional[List[clouds.Zone]] = None,
auth_config: Optional[Dict[str, str]] = None,
dryrun: bool = False) -> Dict[str, str]:
dryrun: bool = False,
force_overwrite: bool = True) -> Dict[str, str]:
"""Fills in cluster configuration templates and writes them out.

Returns: {provisioner: path to yaml, the provisioning spec}.
@@ -700,6 +747,15 @@ def write_cluster_config(to_provision: 'resources.Resources',
auth_config = onprem_utils.get_local_auth_config(cluster_name)
region_name = resources_vars.get('region')

yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
old_yaml_content = None
if os.path.exists(yaml_path):
if force_overwrite:

Member commented:
When would it be the case that the cluster doesn't exist but this file does? If this is an exceptional case to guard against, rename it to keep_launch_fields_in_existing_config: bool?

Collaborator Author commented:
The config yaml files may not be correctly deleted if an error happens during sky.down. I did find multiple yaml files in the folder ~/.sky/generated that do not belong to any existing cluster.
Good point! Let me rename the variable.

os.unlink(yaml_path)
else:
with open(yaml_path, 'r') as f:
old_yaml_content = f.read()

yaml_path = fill_template(
cluster_config_template,
dict(
@@ -750,6 +806,16 @@
# been fully tested yet.
_optimize_file_mounts(yaml_path)

# Restore the old yaml content for backward compatibility.
if old_yaml_content is not None:
with open(yaml_path, 'r') as f:
new_yaml_content = f.read()
restored_yaml_content = _restore_original_block_for_yaml(
new_yaml_content, old_yaml_content,
_RAY_YAML_KEYS_TO_RESTORE_FOR_BACK_COMPATIBILITY)
with open(yaml_path, 'w') as f:
f.write(restored_yaml_content)

usage_lib.messages.usage.update_ray_yaml(yaml_path)
# For TPU nodes. TPU VMs do not need TPU_NAME.
if (resources_vars.get('tpu_type') is not None and
3 changes: 2 additions & 1 deletion sky/backends/cloud_vm_ray_backend.py
@@ -924,7 +924,8 @@ def _retry_region_zones(self,
self._wheel_hash,
region=region,
zones=zones,
dryrun=dryrun)
dryrun=dryrun,
force_overwrite=prev_cluster_status is None)
if dryrun:
return
cluster_config_file = config_dict['ray']
8 changes: 8 additions & 0 deletions sky/templates/aws-ray.yml.j2
@@ -26,10 +26,15 @@ available_node_types:
node_config:
InstanceType: {{instance_type}}
ImageId: {{image_id}} # Deep Learning AMI (Ubuntu 18.04); see aws.py.
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: {{disk_size}}
# Explicitly set the gp3 defaults (3000 IOPS, 125 MiB/s throughput).
VolumeType: gp3
Iops: 3000
Throughput: 125
{% if use_spot %}
InstanceMarketOptions:
MarketType: spot
@@ -49,6 +54,9 @@ available_node_types:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: {{disk_size}}
VolumeType: gp3
Iops: 3000
Throughput: 125
{% if use_spot %}
InstanceMarketOptions:
MarketType: spot
1 change: 1 addition & 0 deletions sky/utils/common_utils.py
@@ -13,6 +13,7 @@
import time
import uuid
from typing import Dict, List, Union

import yaml

from sky import sky_logging
89 changes: 49 additions & 40 deletions tests/backward_comaptibility_tests.sh
@@ -1,71 +1,78 @@
#!/bin/bash
set -ex
set -ev

CLUSTER_NAME="test-back-compat"
. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh
source ~/.bashrc
CLUSTER_NAME="test-back-compat-$USER"
source $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh

git clone https://github.com/skypilot-org/skypilot.git sky-master || true


# Create environment for compatibility tests
conda create -n sky-back-compat-master -y --clone sky-dev
conda create -n sky-back-compat-current -y --clone sky-dev

conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
pip uninstall skypilot -y

mamba > /dev/null 2>&1 || conda install -n base -c conda-forge mamba -y
mamba init
source ~/.bashrc
mamba env list | grep sky-back-compat-master || mamba create -n sky-back-compat-master -y python=3.9

mamba activate sky-back-compat-master
mamba install -c conda-forge google-cloud-sdk -y
rm -r ~/.sky/wheels || true
cd sky-master
git pull origin master
pip install -e ".[all]"
cd -

conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba env list | grep sky-back-compat-current || mamba create -n sky-back-compat-current -y --clone sky-back-compat-master
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
pip uninstall -y skypilot
pip install -e ".[all]"


# exec + launch
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
which sky
sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml
sky launch --cloud gcp -y -c ${CLUSTER_NAME} examples/minimal.yaml

conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky exec --cloud gcp ${CLUSTER_NAME} examples/minimal.yaml
sky launch --cloud gcp -c ${CLUSTER_NAME} examples/minimal.yaml
sky queue
s=$(sky launch --cloud gcp -d -c ${CLUSTER_NAME} examples/minimal.yaml)
echo $s
echo $s | grep "Job ID: 3"
sky queue ${CLUSTER_NAME}

# sky stop + sky start + sky exec
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-2 examples/minimal.yaml
conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky stop -y ${CLUSTER_NAME}-2
sky start -y ${CLUSTER_NAME}-2
sky exec --cloud gcp ${CLUSTER_NAME}-2 examples/minimal.yaml
s=$(sky exec --cloud gcp -d ${CLUSTER_NAME}-2 examples/minimal.yaml)
echo $s
echo $s | grep "Job ID: 2"

# `sky autostop` + `sky status -r`
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-3 examples/minimal.yaml
conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky autostop -y -i0 ${CLUSTER_NAME}-3
sleep 100
sky status -r | grep ${CLUSTER_NAME}-3 | grep STOPPED


# (1 node) sky launch + sky exec + sky queue + sky logs
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml
sky stop -y ${CLUSTER_NAME}-4
conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-4 examples/minimal.yaml
sky queue ${CLUSTER_NAME}-4
sky logs ${CLUSTER_NAME}-4 1 --status
@@ -74,12 +81,12 @@ sky logs ${CLUSTER_NAME}-4 1
sky logs ${CLUSTER_NAME}-4 2

# (1 node) sky start + sky exec + sky queue + sky logs
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-5 examples/minimal.yaml
sky stop -y ${CLUSTER_NAME}-5
conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky start -y ${CLUSTER_NAME}-5
sky queue ${CLUSTER_NAME}-5
sky logs ${CLUSTER_NAME}-5 1 --status
@@ -90,12 +97,12 @@ sky logs ${CLUSTER_NAME}-5 2 --status
sky logs ${CLUSTER_NAME}-5 2

# (2 nodes) sky launch + sky exec + sky queue + sky logs
conda activate sky-back-compat-master
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-master
rm -r ~/.sky/wheels || true
sky launch --cloud gcp -y -c ${CLUSTER_NAME}-6 examples/multi_hostname.yaml
sky stop -y ${CLUSTER_NAME}-6
conda activate sky-back-compat-current
rm -r ~/.sky/sky_wheels || true
mamba activate sky-back-compat-current
rm -r ~/.sky/wheels || true
sky start -y ${CLUSTER_NAME}-6
sky queue ${CLUSTER_NAME}-6
sky logs ${CLUSTER_NAME}-6 1 --status
@@ -104,3 +111,5 @@ sky exec --cloud gcp ${CLUSTER_NAME}-6 examples/multi_hostname.yaml
sky queue ${CLUSTER_NAME}-6
sky logs ${CLUSTER_NAME}-6 2 --status
sky logs ${CLUSTER_NAME}-6 2

sky down ${CLUSTER_NAME}* -y