From 06afd93a0b7eec8185ddcee855d49d569137c693 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 28 Aug 2022 14:58:55 -0700 Subject: [PATCH] update ray node provider to 2.0.0 update patches Adapt to ray functions in 2.0.0 update azure-cli version for faster installation format [Onprem] Automatically install sky dependencies (#1116) * Remove root user, move ray cluster to admin * Automatically install sky dependencies * Fix admin alignment * Fix PR * Address romil's comments * F * Addressed Romil's comments Add `--retry-until-up`, `--region`, `--zone`, and `--idle-minutes-to-autostop` for interactive nodes (#1207) * Add --retry-until-up flag for interactive nodes * Add --region flag for interactive nodes * Add --idle-minutes-to-autostop flag for interactive nodes * Add --zone flag for interactive nodes * Update help messages * Address nit Add all region option in catalog fetcher and speed up azure fetcher (#1204) * Port changes * format * add t2a exclusion back * fix A100 for GCP * fix aws fetching for p4de.24xlarge * Fill GPUInfo * fix * address part of comments * address comments * add test for A100 * patch GpuInfo * Add generation info * Add capabilities back to azure and fix aws * fix azure catalog * format * lint * remove zone from azure * fix azure * Add analyze for csv * update catalog analysis * format * backward compatible for azure_catalog * yapf * fix GCP catalog * fix A100-80GB * format * increase version number * only keep useful columns for aws * remove capabilities from azure * add az to AWS Revert "Add `--retry-until-up`, `--region`, `--zone`, and `--idle-minutes-to-autostop` for interactive nodes" (#1220) Revert "Add `--retry-until-up`, `--region`, `--zone`, and `--idle-minutes-to-autostop` for interactive nodes (#1207)" This reverts commit f06416d16e5e62af37118352a72fbfba4a954486. [Storage] Add `StorageMode` to __init__ (#1223) * Add storage mode to __init__ * fix [Example] Minimal containerized app example (#1212) * Container example * parenthesis * Add explicit StorageMode * lint Fix Mac Version in Setup.py (#1224) Fix mac Reduce iops for aws instances (#1221) * set the default iops to be same as console for AWS * fix Revert "Reduce iops for aws instances" (#1229) Revert "Reduce iops for aws instances (#1221)" This reverts commit 29f145863daee694b4221c62380664727c14db51. update back compat test --- sky/backends/cloud_vm_ray_backend.py | 6 ++- sky/setup_files/setup.py | 4 +- sky/skylet/LICENCE | 12 ++--- sky/skylet/constants.py | 2 +- sky/skylet/job_lib.py | 20 ++++++-- .../aws/cloudwatch/cloudwatch_helper.py | 12 +++-- sky/skylet/providers/aws/config.py | 46 +++++++++--------- sky/skylet/providers/aws/node_provider.py | 48 ++++++++++--------- sky/skylet/providers/aws/utils.py | 12 +++-- sky/skylet/providers/azure/config.py | 2 +- sky/skylet/providers/azure/node_provider.py | 12 ++--- sky/skylet/providers/gcp/config.py | 10 ++-- sky/skylet/providers/gcp/node.py | 11 +++-- sky/skylet/providers/gcp/node_provider.py | 40 ++++++++++------ sky/skylet/ray_patches/__init__.py | 2 +- sky/skylet/ray_patches/autoscaler.py.patch | 4 +- sky/skylet/ray_patches/cli.py.patch | 6 +-- .../ray_patches/command_runner.py.patch | 2 +- sky/skylet/ray_patches/job_manager.py.patch | 5 +- sky/skylet/ray_patches/log_monitor.py.patch | 7 ++- .../resource_demand_scheduler.py.patch | 6 +-- sky/skylet/ray_patches/worker.py.patch | 8 ++-- 22 files changed, 154 insertions(+), 123 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index fc22bdf91a6..60131b093a1 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -401,6 +401,8 @@ def add_epilogue(self) -> None: # Need this to set the job status in ray job to be FAILED. sys.exit(1) else: + sys.stdout.flush() + sys.stderr.flush() job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED) # This waits for all streaming logs to finish. time.sleep(1) @@ -2066,8 +2068,8 @@ def _exec_code_on_head( else: job_submit_cmd = ( f'{cd} && mkdir -p {remote_log_dir} && ray job submit ' - f'--address=http://127.0.0.1:8265 --job-id {ray_job_id} ' - '--no-wait -- ' + f'--address=http://127.0.0.1:8265 --submission-id {ray_job_id} ' + '--no-wait ' f'"{executable} -u {script_path} > {remote_log_path} 2>&1"') returncode, stdout, stderr = self.run_on_head(handle, diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index a5b3c4ae00f..ce43432a0c2 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -68,7 +68,7 @@ def parse_footnote(readme: str) -> str: 'PrettyTable', # Lower local ray version is not fully supported, due to the # autoscaler issues (also tracked in #537). - 'ray[default]>=1.9.0,<=1.13.0', + 'ray[default]>=1.9.0', 'rich', 'tabulate', 'filelock', # TODO(mraheja): Enforce >=3.6.0 when python version is >= 3.7 @@ -96,7 +96,7 @@ def parse_footnote(readme: str) -> str: ], # TODO(zongheng): azure-cli is huge and takes a long time to install. # Tracked in: https://github.com/Azure/azure-cli/issues/7387 - 'azure': ['azure-cli==2.31.0', 'azure-core'], + 'azure': ['azure-cli==2.39.0', 'azure-core'], 'gcp': ['google-api-python-client', 'google-cloud-storage'], 'docker': ['docker'], } diff --git a/sky/skylet/LICENCE b/sky/skylet/LICENCE index 0dc90ab2cf0..e9a3e18018d 100644 --- a/sky/skylet/LICENCE +++ b/sky/skylet/LICENCE @@ -203,16 +203,16 @@ -------------------------------------------------------------------------------- Code in providers/azure from -https://github.com/ray-project/ray/tree/ray-1.13.0/python/ray/autoscaler/_private/_azure -Git commit of the release 1.13.0: 4ce38d001dbbe09cd21c497fedd03d692b2be3e +https://github.com/ray-project/ray/tree/ray-2.0.0/python/ray/autoscaler/_private/_azure +Git commit of the release 2.0.0: cba26cc83f6b5b8a2ff166594a65cb74c0ec8740 Code in providers/gcp from -https://github.com/ray-project/ray/tree/ray-1.13.0/python/ray/autoscaler/_private/gcp -Git commit of the release 1.13.0: 4ce38d001dbbe09cd21c497fedd03d692b2be3e +https://github.com/ray-project/ray/tree/ray-2.0.0/python/ray/autoscaler/_private/gcp +Git commit of the release 2.0.0: cba26cc83f6b5b8a2ff166594a65cb74c0ec8740 Code in providers/aws from -https://github.com/ray-project/ray/tree/ray-1.13.0/python/ray/autoscaler/_private/aws -Git commit of the release 1.13.0: 4ce38d001dbbe09cd21c497fedd03d692b2be3e +https://github.com/ray-project/ray/tree/ray-2.0.0/python/ray/autoscaler/_private/aws +Git commit of the release 2.0.0: cba26cc83f6b5b8a2ff166594a65cb74c0ec8740 Copyright 2016-2022 Ray developers diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index c0d98ecef2b..2ffc1fd26db 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -2,7 +2,7 @@ SKY_LOGS_DIRECTORY = '~/sky_logs' SKY_REMOTE_WORKDIR = '~/sky_workdir' -SKY_REMOTE_RAY_VERSION = '1.13.0' +SKY_REMOTE_RAY_VERSION = '2.0.0' # TODO(mluo): Make explicit `sky launch -c ''` optional. UNINITIALIZED_ONPREM_CLUSTER_MESSAGE = ( diff --git a/sky/skylet/job_lib.py b/sky/skylet/job_lib.py index ddd8ca95721..cf7b2b74ae7 100644 --- a/sky/skylet/job_lib.py +++ b/sky/skylet/job_lib.py @@ -7,6 +7,7 @@ import pathlib import shlex import time +import typing from typing import Any, Dict, List, Optional import filelock @@ -18,6 +19,9 @@ from sky.utils import db_utils from sky.utils import log_utils +if typing.TYPE_CHECKING: + from ray.dashboard.modules.job.pydantic_models import JobDetails + logger = sky_logging.init_logger(__name__) _JOB_STATUS_LOCK = '~/.sky/locks/.job_{}.lock' @@ -342,13 +346,19 @@ def update_job_status(job_owner: str, job_client = _create_ray_job_submission_client() - # In ray 1.13.0, job_client.list_jobs returns a dict of job_id to job_info, - # where job_info contains the job status (str). - ray_job_infos = job_client.list_jobs() + # In ray 2.0.0, job_client.list_jobs returns a list of JobDetails, + # which contains the job status (str) and submission_id (str). + job_details_list: List['JobDetails'] = job_client.list_jobs() + + job_details = dict() + ray_job_ids_set = set(ray_job_ids) + for job_detail in job_details_list: + if job_detail.submission_id in ray_job_ids_set: + job_details[job_detail.submission_id] = job_detail job_statuses: List[JobStatus] = [None] * len(ray_job_ids) for i, ray_job_id in enumerate(ray_job_ids): - if ray_job_id in ray_job_infos: - ray_status = ray_job_infos[ray_job_id].status + if ray_job_id in job_details: + ray_status = job_details[ray_job_id].status job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status] assert len(job_statuses) == len(job_ids), (job_statuses, job_ids) diff --git a/sky/skylet/providers/aws/cloudwatch/cloudwatch_helper.py b/sky/skylet/providers/aws/cloudwatch/cloudwatch_helper.py index b87805d6d4b..526cd19d1ae 100644 --- a/sky/skylet/providers/aws/cloudwatch/cloudwatch_helper.py +++ b/sky/skylet/providers/aws/cloudwatch/cloudwatch_helper.py @@ -1,13 +1,15 @@ -import botocore import copy +import hashlib import json -import os import logging +import os import time -import hashlib -from typing import Any, Dict, List, Union, Tuple +from typing import Any, Dict, List, Tuple, Union + +import botocore + from sky.skylet.providers.aws.utils import client_cache, resource_cache -from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, NODE_KIND_HEAD, TAG_RAY_NODE_KIND +from ray.autoscaler.tags import NODE_KIND_HEAD, TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_KIND logger = logging.getLogger(__name__) diff --git a/sky/skylet/providers/aws/config.py b/sky/skylet/providers/aws/config.py index abdd487566e..6de35659d4b 100644 --- a/sky/skylet/providers/aws/config.py +++ b/sky/skylet/providers/aws/config.py @@ -1,30 +1,29 @@ -from distutils.version import StrictVersion -from functools import lru_cache -from functools import partial import copy import itertools import json +import logging import os import time +from distutils.version import StrictVersion +from functools import lru_cache, partial from typing import Any, Dict, List, Optional, Set, Tuple -import logging import boto3 import botocore -from ray.autoscaler._private.util import check_legacy_fields -from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER -from ray.autoscaler._private.providers import _PROVIDER_PRETTY_NAMES +from sky.skylet.providers.aws.cloudwatch.cloudwatch_helper import ( + CloudwatchHelper as cwh, +) from sky.skylet.providers.aws.utils import ( LazyDefaultDict, handle_boto_error, resource_cache, ) -from ray.autoscaler._private.cli_logger import cli_logger, cf +from ray.autoscaler._private.cli_logger import cf, cli_logger from ray.autoscaler._private.event_system import CreateClusterEvent, global_event_system -from sky.skylet.providers.aws.cloudwatch.cloudwatch_helper import ( - CloudwatchHelper as cwh, -) +from ray.autoscaler._private.providers import _PROVIDER_PRETTY_NAMES +from ray.autoscaler._private.util import check_legacy_fields +from ray.autoscaler.tags import NODE_TYPE_LEGACY_HEAD, NODE_TYPE_LEGACY_WORKER logger = logging.getLogger(__name__) @@ -33,20 +32,21 @@ DEFAULT_RAY_IAM_ROLE = RAY + "-v1" SECURITY_GROUP_TEMPLATE = RAY + "-{}" -DEFAULT_AMI_NAME = "AWS Deep Learning AMI (Ubuntu 18.04) V30.0" +# V61.0 has CUDA 11.2 +DEFAULT_AMI_NAME = "AWS Deep Learning AMI (Ubuntu 18.04) V61.0" -# Obtained from https://aws.amazon.com/marketplace/pp/B07Y43P7X5 on 8/4/2020. +# Obtained from https://aws.amazon.com/marketplace/pp/B07Y43P7X5 on 6/10/2022. DEFAULT_AMI = { - "us-east-1": "ami-029510cec6d69f121", # US East (N. Virginia) - "us-east-2": "ami-08bf49c7b3a0c761e", # US East (Ohio) - "us-west-1": "ami-0cc472544ce594a19", # US West (N. California) - "us-west-2": "ami-0a2363a9cff180a64", # US West (Oregon) - "ca-central-1": "ami-0a871851b2ab39f01", # Canada (Central) - "eu-central-1": "ami-049fb1ea198d189d7", # EU (Frankfurt) - "eu-west-1": "ami-0abcbc65f89fb220e", # EU (Ireland) - "eu-west-2": "ami-0755b39fd4dab7cbe", # EU (London) - "eu-west-3": "ami-020485d8df1d45530", # EU (Paris) - "sa-east-1": "ami-058a6883cbdb4e599", # SA (Sao Paulo) + "us-east-1": "ami-0dd6adfad4ad37eec", # US East (N. Virginia) + "us-east-2": "ami-0c77cd5ca05bf1281", # US East (Ohio) + "us-west-1": "ami-020ab1b368a5ed1db", # US West (N. California) + "us-west-2": "ami-0387d929287ab193e", # US West (Oregon) + "ca-central-1": "ami-07dbafdbd38f18d98", # Canada (Central) + "eu-central-1": "ami-0383bd0c1fc4c63ec", # EU (Frankfurt) + "eu-west-1": "ami-0a074b0a311a837ac", # EU (Ireland) + "eu-west-2": "ami-094ba2b4651f761ca", # EU (London) + "eu-west-3": "ami-031da10fbf225bf5f", # EU (Paris) + "sa-east-1": "ami-0be7c1f1dd96d7337", # SA (Sao Paulo) } # todo: cli_logger should handle this assert properly diff --git a/sky/skylet/providers/aws/node_provider.py b/sky/skylet/providers/aws/node_provider.py index 96335a7beba..7a2a16cdf09 100644 --- a/sky/skylet/providers/aws/node_provider.py +++ b/sky/skylet/providers/aws/node_provider.py @@ -1,36 +1,39 @@ import copy -import threading -from collections import defaultdict, OrderedDict import logging +import threading import time +from collections import defaultdict, OrderedDict from typing import Any, Dict, List import botocore +from boto3.resources.base import ServiceResource -from ray.autoscaler.node_provider import NodeProvider -from ray.autoscaler.tags import ( - TAG_RAY_CLUSTER_NAME, - TAG_RAY_NODE_NAME, - TAG_RAY_LAUNCH_CONFIG, - TAG_RAY_NODE_KIND, - TAG_RAY_USER_NODE_TYPE, +try: + import ray._private.ray_constants as ray_constants +except ImportError: + # SkyPilot: for local ray version lower than 2.0.0 + import ray.ray_constants as ray_constants +from sky.skylet.providers.aws.cloudwatch.cloudwatch_helper import ( + CloudwatchHelper, + CLOUDWATCH_AGENT_INSTALLED_AMI_TAG, + CLOUDWATCH_AGENT_INSTALLED_TAG, ) -from ray.autoscaler._private.constants import BOTO_MAX_RETRIES, BOTO_CREATE_MAX_RETRIES from sky.skylet.providers.aws.config import bootstrap_aws -from ray.autoscaler._private.log_timer import LogTimer - from sky.skylet.providers.aws.utils import ( boto_exception_handler, resource_cache, client_cache, ) from ray.autoscaler._private.cli_logger import cli_logger, cf -import ray.ray_constants as ray_constants - -from sky.skylet.providers.aws.cloudwatch.cloudwatch_helper import ( - CloudwatchHelper, - CLOUDWATCH_AGENT_INSTALLED_AMI_TAG, - CLOUDWATCH_AGENT_INSTALLED_TAG, +from ray.autoscaler._private.constants import BOTO_MAX_RETRIES, BOTO_CREATE_MAX_RETRIES +from ray.autoscaler._private.log_timer import LogTimer +from ray.autoscaler.node_provider import NodeProvider +from ray.autoscaler.tags import ( + TAG_RAY_CLUSTER_NAME, + TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, + TAG_RAY_USER_NODE_TYPE, ) logger = logging.getLogger(__name__) @@ -56,7 +59,7 @@ def from_aws_format(tags): return tags -def make_ec2_client(region, max_retries, aws_credentials=None): +def make_ec2_resource(region, max_retries, aws_credentials=None): """Make client, retrying requests up to `max_retries`.""" aws_credentials = aws_credentials or {} return resource_cache("ec2", region, max_retries, **aws_credentials) @@ -67,7 +70,7 @@ def list_ec2_instances( ) -> List[Dict[str, Any]]: """Get all instance-types/resources available in the user's AWS region. Args: - region (str): the region of the AWS provider. e.g., "us-west-2". + region: the region of the AWS provider. e.g., "us-west-2". Returns: final_instance_types: a list of instances. An example of one element in the list: @@ -101,12 +104,12 @@ def __init__(self, provider_config, cluster_name): self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) aws_credentials = provider_config.get("aws_credentials") - self.ec2 = make_ec2_client( + self.ec2 = make_ec2_resource( region=provider_config["region"], max_retries=BOTO_MAX_RETRIES, aws_credentials=aws_credentials, ) - self.ec2_fail_fast = make_ec2_client( + self.ec2_fail_fast = make_ec2_resource( region=provider_config["region"], max_retries=0, aws_credentials=aws_credentials, @@ -494,7 +497,6 @@ def terminate_node(self, node_id): # asyncrhonous or error, which would result in a use after free error. # If this leak becomes bad, we can garbage collect the tag cache when # the node cache is updated. - pass def _check_ami_cwa_installation(self, config): response = self.ec2.meta.client.describe_images(ImageIds=[config["ImageId"]]) diff --git a/sky/skylet/providers/aws/utils.py b/sky/skylet/providers/aws/utils.py index 154931548ce..72dda831a80 100644 --- a/sky/skylet/providers/aws/utils.py +++ b/sky/skylet/providers/aws/utils.py @@ -1,11 +1,13 @@ from collections import defaultdict from functools import lru_cache +import boto3 from boto3.exceptions import ResourceNotExistsError +from boto3.resources.base import ServiceResource +from botocore.client import BaseClient from botocore.config import Config -import boto3 -from ray.autoscaler._private.cli_logger import cli_logger, cf +from ray.autoscaler._private.cli_logger import cf, cli_logger from ray.autoscaler._private.constants import BOTO_MAX_RETRIES @@ -141,7 +143,9 @@ def __exit__(self, type, value, tb): @lru_cache() -def resource_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs): +def resource_cache( + name, region, max_retries=BOTO_MAX_RETRIES, **kwargs +) -> ServiceResource: cli_logger.verbose( "Creating AWS resource `{}` in `{}`", cf.bold(name), cf.bold(region) ) @@ -157,7 +161,7 @@ def resource_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs): @lru_cache() -def client_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs): +def client_cache(name, region, max_retries=BOTO_MAX_RETRIES, **kwargs) -> BaseClient: try: # try to re-use a client from the resource cache first return resource_cache(name, region, max_retries, **kwargs).meta.client diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index a8a9ca08879..530d15e4b00 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -1,7 +1,7 @@ import json import logging -from pathlib import Path import random +from pathlib import Path from typing import Any, Callable from azure.common.credentials import get_cli_profile diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 6bd7a9f2394..630b5993474 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -10,18 +10,18 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.skylet.providers.azure.config import ( + bootstrap_azure, + get_azure_sdk_function, +) from ray.autoscaler.node_provider import NodeProvider from ray.autoscaler.tags import ( TAG_RAY_CLUSTER_NAME, - TAG_RAY_NODE_NAME, - TAG_RAY_NODE_KIND, TAG_RAY_LAUNCH_CONFIG, + TAG_RAY_NODE_KIND, + TAG_RAY_NODE_NAME, TAG_RAY_USER_NODE_TYPE, ) -from sky.skylet.providers.azure.config import ( - bootstrap_azure, - get_azure_sdk_function, -) VM_NAME_MAX_LEN = 64 VM_NAME_UUID_LEN = 8 diff --git a/sky/skylet/providers/gcp/config.py b/sky/skylet/providers/gcp/config.py index 539f9d67736..42ab7caf517 100644 --- a/sky/skylet/providers/gcp/config.py +++ b/sky/skylet/providers/gcp/config.py @@ -1,19 +1,19 @@ import copy -from functools import partial import json -import os import logging +import os import time +from functools import partial +from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives import serialization from cryptography.hazmat.primitives.asymmetric import rsa -from cryptography.hazmat.backends import default_backend -from googleapiclient import discovery, errors from google.oauth2 import service_account from google.oauth2.credentials import Credentials as OAuthCredentials +from googleapiclient import discovery, errors +from sky.skylet.providers.gcp.node import MAX_POLLS, POLL_INTERVAL, GCPNodeType from ray.autoscaler._private.util import check_legacy_fields -from sky.skylet.providers.gcp.node import GCPNodeType, MAX_POLLS, POLL_INTERVAL logger = logging.getLogger(__name__) diff --git a/sky/skylet/providers/gcp/node.py b/sky/skylet/providers/gcp/node.py index aac8e5f7f85..5dec7451c08 100644 --- a/sky/skylet/providers/gcp/node.py +++ b/sky/skylet/providers/gcp/node.py @@ -23,16 +23,17 @@ class inheriting from ``GCPNode``. Those classes are essentially dicts node provider. """ -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple, Union -import logging import abc -import time +import logging import re -from uuid import uuid4 +import time + from collections import UserDict +from copy import deepcopy from enum import Enum from functools import wraps +from typing import Any, Dict, List, Optional, Tuple, Union +from uuid import uuid4 from googleapiclient.discovery import Resource from googleapiclient.errors import HttpError diff --git a/sky/skylet/providers/gcp/node_provider.py b/sky/skylet/providers/gcp/node_provider.py index de1da31f7e8..7257a87c8c2 100644 --- a/sky/skylet/providers/gcp/node_provider.py +++ b/sky/skylet/providers/gcp/node_provider.py @@ -1,37 +1,39 @@ -from typing import Dict +import logging +import time import copy from functools import wraps from threading import RLock -import time -import logging +from typing import Dict, List, Tuple import googleapiclient -from ray.autoscaler.node_provider import NodeProvider -from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_NODE_KIND, - TAG_RAY_USER_NODE_TYPE) -from ray.autoscaler._private.cli_logger import cli_logger, cf - from sky.skylet.providers.gcp.config import ( bootstrap_gcp, construct_clients_from_provider_config, get_node_type, ) +from ray.autoscaler.tags import (TAG_RAY_LAUNCH_CONFIG, TAG_RAY_NODE_KIND, + TAG_RAY_USER_NODE_TYPE) +from ray.autoscaler._private.cli_logger import cf, cli_logger + + # The logic has been abstracted away here to allow for different GCP resources # (API endpoints), which can differ widely, making it impossible to use # the same logic for everything. from sky.skylet.providers.gcp.node import ( # noqa - GCPResource, - GCPNode, GCPCompute, - GCPTPU, + GCPNode, GCPNodeType, + GCPResource, + GCPTPU, + # Added by SkyPilot INSTANCE_NAME_MAX_LEN, INSTANCE_NAME_UUID_LEN, MAX_POLLS_STOP, POLL_INTERVAL, ) +from ray.autoscaler.node_provider import NodeProvider logger = logging.getLogger(__name__) @@ -173,8 +175,14 @@ def internal_ip(self, node_id: str): return ip @_retry - def create_node(self, base_config: dict, tags: dict, count: int) -> None: + def create_node(self, base_config: dict, tags: dict, count: int) -> Dict[str, dict]: + """Creates instances. + + Returns dict mapping instance id to each create operation result for the created + instances. + """ with self.lock: + result_dict = dict() labels = tags # gcp uses "labels" instead of aws "tags" labels = dict(sorted(copy.deepcopy(labels).items())) @@ -211,12 +219,16 @@ def create_node(self, base_config: dict, tags: dict, count: int) -> None: "under `provider` in the cluster configuration." ) for node_id in reuse_node_ids: - resource.start_instance(node_id) + result = resource.start_instance(node_id) + result_dict[node_id] = {node_id: result} for node_id in reuse_node_ids: self.set_node_tags(node_id, tags) count -= len(reuse_node_ids) if count: - resource.create_instances(base_config, labels, count) + results = resource.create_instances(base_config, labels, count) + result_dict.update({instance_id: result for result, instance_id in results}) + return result_dict + @_retry def terminate_node(self, node_id: str): diff --git a/sky/skylet/ray_patches/__init__.py b/sky/skylet/ray_patches/__init__.py index 8ab9b73e203..af9e2ce6fd8 100644 --- a/sky/skylet/ray_patches/__init__.py +++ b/sky/skylet/ray_patches/__init__.py @@ -60,7 +60,7 @@ def patch() -> None: from ray._private import log_monitor _run_patch(log_monitor.__file__, _to_absolute('log_monitor.py.patch')) - from ray import worker + from ray._private import worker _run_patch(worker.__file__, _to_absolute('worker.py.patch')) from ray.dashboard.modules.job import cli diff --git a/sky/skylet/ray_patches/autoscaler.py.patch b/sky/skylet/ray_patches/autoscaler.py.patch index 123979934c9..0c64da58580 100644 --- a/sky/skylet/ray_patches/autoscaler.py.patch +++ b/sky/skylet/ray_patches/autoscaler.py.patch @@ -1,9 +1,9 @@ 0a1,4 -> # From https://github.com/ray-project/ray/blob/releases/1.13.0/python/ray/autoscaler/_private/autoscaler.py +> # From https://github.com/ray-project/ray/blob/ray-2.0.0/python/ray/autoscaler/_private/autoscaler.py > # Sky patch changes: > # - enable upscaling_speed to be 0.0 > -915c919 +1022c1026 < if upscaling_speed: --- > if upscaling_speed is not None: # NOTE(sky): enable 0.0 diff --git a/sky/skylet/ray_patches/cli.py.patch b/sky/skylet/ray_patches/cli.py.patch index ae7ee2bd2a9..a3d75af545c 100644 --- a/sky/skylet/ray_patches/cli.py.patch +++ b/sky/skylet/ray_patches/cli.py.patch @@ -1,11 +1,11 @@ 0a1,4 -> # From https://github.com/ray-project/ray/blob/ray-1.13.0/dashboard/modules/job/cli.py +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.0.0/dashboard/modules/job/cli.py > # Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514 > # Otherwise, the output redirection ">" will not work. > -4d7 +5d8 < from subprocess import list2cmdline -153c156 +182c185 < entrypoint=list2cmdline(entrypoint), --- > entrypoint=" ".join(entrypoint), diff --git a/sky/skylet/ray_patches/command_runner.py.patch b/sky/skylet/ray_patches/command_runner.py.patch index 2f47022c661..899f2e02d75 100644 --- a/sky/skylet/ray_patches/command_runner.py.patch +++ b/sky/skylet/ray_patches/command_runner.py.patch @@ -1,4 +1,4 @@ -330c330 +329c329 < "ControlPersist": "10s", --- > "ControlPersist": "300s", diff --git a/sky/skylet/ray_patches/job_manager.py.patch b/sky/skylet/ray_patches/job_manager.py.patch index 30500cca653..2e7896a3280 100644 --- a/sky/skylet/ray_patches/job_manager.py.patch +++ b/sky/skylet/ray_patches/job_manager.py.patch @@ -1,9 +1,9 @@ 0a1,4 -> # Adapted from https://github.com/ray-project/ray/blob/ray-1.13.0/dashboard/modules/job/job_manager.py +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.0.0/dashboard/modules/job/job_manager.py > # Fixed the problem where the _monitor_job thread is leaked, due to `await job_supervisor.ping.remote()` > # does not raise an exception after the job_supervisor is exited, causing the dashboard to hang. > -334c338,349 +384c388,398 < await job_supervisor.ping.remote() --- > # Simulate the await behavior, in case some unexpected exception happens @@ -17,4 +17,3 @@ > ray.get(ready) > else: > await asyncio.sleep(0) -> diff --git a/sky/skylet/ray_patches/log_monitor.py.patch b/sky/skylet/ray_patches/log_monitor.py.patch index fa63f2e7316..b434c9968ac 100644 --- a/sky/skylet/ray_patches/log_monitor.py.patch +++ b/sky/skylet/ray_patches/log_monitor.py.patch @@ -1,9 +1,8 @@ -0a1,4 -> # Adapted from https://github.com/ray-project/ray/blob/ray-1.13.0/python/ray/_private/log_monitor.py +0a1,3 +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.0.0/python/ray/_private/log_monitor.py > # Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar. > # The change is adapted from https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300 -> -306c310,311 +351c354,355 < next_line = next_line.rstrip("\r\n") --- > if next_line[-1] == "\n": diff --git a/sky/skylet/ray_patches/resource_demand_scheduler.py.patch b/sky/skylet/ray_patches/resource_demand_scheduler.py.patch index 59e2e3642d5..53f709e3534 100644 --- a/sky/skylet/ray_patches/resource_demand_scheduler.py.patch +++ b/sky/skylet/ray_patches/resource_demand_scheduler.py.patch @@ -1,17 +1,17 @@ 0a1,5 -> # From https://github.com/ray-project/ray/blob/releases/1.13.0/python/ray/autoscaler/_private/resource_demand_scheduler.py +> # From https://github.com/ray-project/ray/blob/ray-2.0.0/python/ray/autoscaler/_private/resource_demand_scheduler.py > # Sky patch changes: > # - no new nodes are allowed to be launched launched when the upscaling_speed is 0 > # - comment out "assert not unfulfilled": this seems a buggy assert > -493c498,501 +509c514,517 < if upper_bound > 0: --- > # NOTE(sky): do not autoscale when upsclaing speed is 0. > if self.upscaling_speed == 0: > upper_bound = 0 > if upper_bound >= 0: -630c638 +646c654 < assert not unfulfilled --- > # assert not unfulfilled # NOTE(sky): buggy assert. diff --git a/sky/skylet/ray_patches/worker.py.patch b/sky/skylet/ray_patches/worker.py.patch index 66135729dfa..8f37aa8483b 100644 --- a/sky/skylet/ray_patches/worker.py.patch +++ b/sky/skylet/ray_patches/worker.py.patch @@ -1,9 +1,9 @@ 0a1,4 -> # Adapted from https://github.com/ray-project/ray/blob/ray-1.13.0/python/ray/worker.py +> # Adapted from https://github.com/ray-project/ray/blob/ray-2.0.0/python/ray/worker.py > # Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/9233 > # Tracked in PR https://github.com/ray-project/ray/pull/21977/files. > -1323a1328,1334 +1748a1753,1759 > > def end_for(line: str) -> str: > if sys.platform == "win32": @@ -11,7 +11,7 @@ > if line.endswith("\r"): > return "" > return "\n" -1343a1355 +1768a1780 > end=end_for(line), -1357a1370 +1782a1795 > end=end_for(line),