diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 21956c4e3d9..c4ac6ce7ebb 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -48,7 +48,7 @@ jobs: run: | python -m pip install --upgrade pip pip install ".[all]" - pip install pytest + pip install pytest pytest-xdist "pytest-env>=0.6" - name: Run tests with pytest run: SKY_DISABLE_USAGE_COLLECTION=1 pytest ${{ matrix.test-path }} diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 1dc0414d413..a0c15f17c46 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -25,6 +25,7 @@ jobs: run: | python -m pip install --upgrade pip pip install yapf==0.32.0 + pip install toml==0.10.2 - name: Running yapf run: | - yapf --diff --style .style.yapf --recursive ./ --exclude 'sky/skylet/ray_patches/**' --exclude 'sky/skylet/providers/**' + yapf --diff --recursive ./ --exclude 'sky/skylet/ray_patches/**' --exclude 'sky/skylet/providers/**' diff --git a/.style.yapf b/.style.yapf deleted file mode 100644 index 059455612e9..00000000000 --- a/.style.yapf +++ /dev/null @@ -1,3 +0,0 @@ -[style] -based_on_style = google -allow_split_before_dict_value = False \ No newline at end of file diff --git a/README.md b/README.md index ba24418f404..5c41e7af23a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- +
![pytest](https://github.com/skypilot-org/skypilot/actions/workflows/pytest.yml/badge.svg) @@ -7,7 +7,6 @@ SkyPilot is a framework for easily running machine learning[^1] workloads on any cloud through a unified interface. No knowledge of cloud offerings is required or expected – you simply define the workload and its resource requirements, and SkyPilot will automatically execute it on AWS, Google Cloud Platform or Microsoft Azure. -[^1]: SkyPilot is primarily targeted at machine learning workloads, but it can also support many general workloads. We're excited to hear about your use case and would love to hear more about how we can better support your requirements - please join us in [this discussion](https://github.com/skypilot-org/skypilot/discussions/1016)! ### Key features * **Run existing projects on the cloud** with zero code changes @@ -75,3 +74,6 @@ We are excited to hear your feedback! SkyPilot has two channels for engaging wit ## Contributing We welcome and value all contributions to the project! Please refer to the [contribution guide](CONTRIBUTING.md) for more on how to get involved. + + +[^1]: SkyPilot is primarily targeted at machine learning workloads, but it can also support many general workloads. We're excited to hear about your use case and would love to hear more about how we can better support your requirements - please join us in [this discussion](https://github.com/skypilot-org/skypilot/discussions/1016)! diff --git a/examples/managed_spot_with_storage.yaml b/examples/managed_spot_with_storage.yaml index 2dabaa64ace..d82aff7db4b 100644 --- a/examples/managed_spot_with_storage.yaml +++ b/examples/managed_spot_with_storage.yaml @@ -15,7 +15,7 @@ file_mounts: ~/sky_workdir: # Change this to the your own globally unique bucket name. name: sky-workdir-zhwu - source: . 
+ source: ./examples persistent: false mode: COPY /imagenet-image: @@ -23,5 +23,5 @@ file_mounts: run: | set -ex - ls ~/sky_workdir/sky + ls ~/sky_workdir/managed_spot_with_storage.yaml ls -l /imagenet-image/datasets diff --git a/format.sh b/format.sh index 0728c1b2d23..7ed6509c461 100755 --- a/format.sh +++ b/format.sh @@ -38,7 +38,6 @@ tool_version_check "pylint" $PYLINT_VERSION "2.8.2" tool_version_check "pylint-quotes" $PYLINT_QUOTES_VERSION "0.2.3" YAPF_FLAGS=( - '--style' "$ROOT/.style.yapf" '--recursive' '--parallel' ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000000..0adcaa8c1eb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +requires = ["setuptools>=58.0"] +build-backend = "setuptools.build_meta" + + +[tool.yapf] +based_on_style = "google" +allow_split_before_dict_value = false + +[tool.pytest.ini_options] +required_plugins = [ + "pytest-xdist", + "pytest-env>=0.6" +] +env = [ + "SKYPILOT_DEBUG=1", + "SKYPILOT_DISABLE_USAGE_COLLECTION=1" +] diff --git a/requirements-dev.txt b/requirements-dev.txt index 510006a737d..a176ffe2c8a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,9 @@ yapf==0.32.0 pylint==2.8.2 # https://github.com/edaniszewski/pylint-quotes pylint-quotes==0.2.3 +toml==0.10.2 # testing pytest pytest-xdist +pytest-env>=0.6 diff --git a/sky/__init__.py b/sky/__init__.py index f78b8333477..451bfe31e10 100644 --- a/sky/__init__.py +++ b/sky/__init__.py @@ -1,6 +1,11 @@ """The SkyPilot package.""" import os +# Replaced with the current commit when building the wheels. +__commit__ = '{{SKYPILOT_COMMIT_SHA}}' +__version__ = '1.0.0-dev0' +__root_dir__ = os.path.dirname(os.path.abspath(__file__)) + # Keep this order to avoid cyclic imports from sky import backends from sky import benchmark @@ -18,8 +23,6 @@ tail_logs, download_logs, job_status, spot_status, spot_cancel, storage_ls, storage_delete) -__root_dir__ = os.path.dirname(os.path.abspath(__file__)) - # Aliases. 
AWS = clouds.AWS Azure = clouds.Azure @@ -28,6 +31,7 @@ optimize = Optimizer.optimize __all__ = [ + '__version__', 'AWS', 'Azure', 'GCP', diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index bbec447db89..3062290c17d 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -628,6 +628,7 @@ def write_cluster_config(to_provision: 'resources.Resources', # Sky remote utils. 'sky_remote_path': SKY_REMOTE_PATH, 'sky_local_path': str(local_wheel_path), + 'sky_version': common_utils.normalize_version(sky.__version__), # Local IP handling (optional). 'head_ip': None if ip_list is None else ip_list[0], 'worker_ips': None if ip_list is None else ip_list[1:], @@ -983,8 +984,11 @@ def get_node_ips(cluster_yaml: str, raise exceptions.FetchIPError( exceptions.FetchIPError.Reason.WORKER) from e # Retry if the ssh is not ready for the workers yet. - logger.debug('Retrying to get worker ip.') - time.sleep(backoff.current_backoff()) + backoff_time = backoff.current_backoff() + logger.debug('Retrying to get worker ip ' + f'[{retry_cnt}/{worker_ip_max_attempts}] in ' + f'{backoff_time} seconds.') + time.sleep(backoff_time) worker_ips = re.findall(IP_ADDR_REGEX, out) # Ray Autoscaler On-prem Bug: ray-get-worker-ips outputs nothing! # Workaround: List of IPs are shown in Stderr diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index c333f03ba4a..0cd578cf885 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1085,12 +1085,18 @@ def need_ray_up( if returncode == 0: return False - if ('Head node fetch timed out. Failed to create head node.' 
- in stderr and isinstance(to_provision_cloud, clouds.Azure)): - logger.info( - 'Retrying head node provisioning due to head fetching ' - 'timeout.') - return True + if isinstance(to_provision_cloud, clouds.Azure): + if 'Failed to invoke the Azure CLI' in stderr: + logger.info( + 'Retrying head node provisioning due to Azure CLI ' + 'issues.') + return True + if ('Head node fetch timed out. Failed to create head node.' + in stderr): + logger.info( + 'Retrying head node provisioning due to head fetching ' + 'timeout.') + return True if ('Processing file mounts' in stdout and 'Running setup commands' not in stdout and 'Failed to setup head node.' in stderr): @@ -1527,11 +1533,11 @@ def _provision(self, to_provision_config.num_nodes, to_provision_config.resources) usage_lib.messages.usage.update_cluster_status(prev_cluster_status) - # TODO(suquark): once we have sky on PYPI, we should directly - # install sky from PYPI. + # TODO(suquark): once we have sky on PyPI, we should directly + # install sky from PyPI. with timeline.Event('backend.provision.wheel_build'): - # TODO(suquark): once we have sky on PYPI, we should directly - # install sky from PYPI. + # TODO(suquark): once we have sky on PyPI, we should directly + # install sky from PyPI. local_wheel_path = wheel_utils.build_sky_wheel() backoff = common_utils.Backoff(_RETRY_UNTIL_UP_INIT_GAP_SECONDS) attempt_cnt = 1 diff --git a/sky/backends/wheel_utils.py b/sky/backends/wheel_utils.py index 9edb2f387b6..5e47a178905 100644 --- a/sky/backends/wheel_utils.py +++ b/sky/backends/wheel_utils.py @@ -11,6 +11,7 @@ import sky from sky.backends import backend_utils +from sky.utils import common_utils # Local wheel path is same as the remote path. 
WHEEL_DIR = pathlib.Path(os.path.expanduser(backend_utils.SKY_REMOTE_PATH)) @@ -37,7 +38,9 @@ def cleanup_wheels_dir(wheel_dir: pathlib.Path, def _get_latest_built_wheel() -> pathlib.Path: try: - latest_wheel = max(WHEEL_DIR.glob(f'{_PACKAGE_WHEEL_NAME}-*.whl'), + latest_wheel = max(WHEEL_DIR.glob( + f'{_PACKAGE_WHEEL_NAME}-' + f'{common_utils.normalize_version(sky.__version__)}-*.whl'), key=os.path.getctime) except ValueError: raise FileNotFoundError('Could not find built Sky wheels.') from None diff --git a/sky/cli.py b/sky/cli.py index 9c4ae12a1d7..dfca4476930 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -73,6 +73,8 @@ logger = sky_logging.init_logger(__name__) +_CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + _CLUSTER_FLAG_HELP = """\ A cluster name. If provided, either reuse an existing cluster with that name or provision a new cluster with that name. Otherwise provision a new cluster with @@ -801,7 +803,8 @@ def get_help(self, ctx): return super().get_help(ctx) -@click.group(cls=_NaturalOrderGroup) +@click.group(cls=_NaturalOrderGroup, context_settings=_CONTEXT_SETTINGS) +@click.version_option(sky.__version__, '--version', '-v', prog_name='skypilot') def cli(): pass diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index e72278362c1..f85712c8740 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -273,7 +273,7 @@ def check_credentials(self) -> Tuple[bool, Optional[str]]: return False, ( 'AWS CLI is not installed properly.' ' Run the following commands under sky folder:' - # TODO(zhwu): after we publish sky to pypi, + # TODO(zhwu): after we publish sky to PyPI, # change this to `pip install sky[aws]` '\n $ pip install .[aws]' '\n Credentials may also need to be set.' + help_str) diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 5154867c44b..7dc88682b21 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -258,7 +258,7 @@ def check_credentials(self) -> Tuple[bool, Optional[str]]: return False, ( 'Azure CLI returned error. 
Run the following commands ' 'under sky folder:' - # TODO(zhwu): after we publish sky to pypi, change this to + # TODO(zhwu): after we publish sky to PyPI, change this to # `pip install sky[azure]` '\n $ pip install .[azure]' '\n Credentials may also need to be set.' + help_str) diff --git a/sky/core.py b/sky/core.py index 93e1d251397..ce5a19ebbdd 100644 --- a/sky/core.py +++ b/sky/core.py @@ -472,7 +472,7 @@ def spot_status(refresh: bool) -> List[Dict[str, Any]]: returncode, code, 'Failed to fetch managed job statuses', job_table_json + stderr) except exceptions.CommandError as e: - raise RuntimeError(e.message) from e + raise RuntimeError(e.error_msg) from e jobs = spot.load_spot_job_queue(job_table_json) return jobs @@ -522,7 +522,7 @@ def spot_cancel(name: Optional[str] = None, 'Failed to cancel managed spot job', stdout) except exceptions.CommandError as e: - raise RuntimeError(e.message) from e + raise RuntimeError(e.error_msg) from e logger.info(stdout) if 'Multiple jobs found with name' in stdout: diff --git a/sky/setup_files/setup.py b/sky/setup_files/setup.py index 513300593d6..35d37c6165d 100644 --- a/sky/setup_files/setup.py +++ b/sky/setup_files/setup.py @@ -1,14 +1,20 @@ """SkyPilot. -SkyPilot is a tool to run any workload seamlessly across different cloud -providers through a unified interface. No knowledge of cloud offerings is -required or expected – you simply define the workload and its resource -requirements, and SkyPilot will automatically execute it on AWS, Google Cloud -Platform or Microsoft Azure. +SkyPilot is a framework for easily running machine learning* workloads on any cloud +through a unified interface. No knowledge of cloud offerings is required or expected – +you simply define the workload and its resource requirements, and SkyPilot will +automatically execute it on AWS, Google Cloud Platform or Microsoft Azure. + +*: SkyPilot is primarily targeted at machine learning workloads, but it can also +support many general workloads. 
We're excited to hear about your use case and would +love to hear more about how we can better support your requirements - please join us +in [this discussion](https://github.com/skypilot-org/skypilot/discussions/1016) """ +import io import os import platform +import re import warnings import setuptools @@ -23,8 +29,27 @@ mac_minor = int(mac_minor) if mac_major < 10 or (mac_major == 10 and mac_minor >= 15): warnings.warn( - f"\'Detected MacOS version {mac_version}. MacOS version >=10.15 " - "is required to install ray>=1.9\'") + f'\'Detected MacOS version {mac_version}. MacOS version >=10.15 ' + 'is required to install ray>=1.9\'') + + +def find_version(*filepath): + # Extract version information from filepath + # Adapted from: https://github.com/ray-project/ray/blob/master/python/setup.py + with open(os.path.join(ROOT_DIR, *filepath)) as fp: + version_match = re.search(r'^__version__ = [\'"]([^\'"]*)[\'"]', + fp.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError('Unable to find version string.') + + +def parse_footnote(readme: str) -> str: + """Parse the footnote from the README.md file.""" + readme = readme.replace('<!-- Footnote -->', '#') + footnote_re = re.compile(r'\[\^([0-9]+)\]') + return footnote_re.sub(r'[\1]', readme) + install_requires = [ 'wheel', @@ -68,14 +93,29 @@ extras_require['all'] = sum(extras_require.values(), []) +long_description = '' +readme_filepath = 'README.md' +# When sky/backends/wheel_utils.py builds wheels, it will not contain the README. +# Skip the description for that case. +if os.path.exists(readme_filepath): + long_description = io.open(readme_filepath, 'r', encoding='utf-8').read() + long_description = parse_footnote(long_description) + setuptools.setup( # NOTE: this affects the package.whl wheel name. When changing this (if # ever), you must grep for '.whl' and change all corresponding wheel paths # (templates/*.j2 and wheel_utils.py).
name='skypilot', - version='0.1.0', + version=find_version('sky', '__init__.py'), packages=setuptools.find_packages(), + author='SkyPilot Team', + license='Apache 2.0', + readme='README.md', + description='SkyPilot: An intercloud broker for the clouds', + long_description=long_description, + long_description_content_type='text/markdown', setup_requires=['wheel'], + python_requires='>=3.6', install_requires=install_requires, extras_require=extras_require, entry_points={ @@ -88,7 +128,15 @@ 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: System :: Distributed Computing', ], - description='SkyPilot', - long_description=__doc__.replace('\n', ' '), + project_urls={ + 'Homepage': 'https://github.com/skypilot-org/skypilot', + 'Issues': 'https://github.com/skypilot-org/skypilot/issues', + 'Discussion': 'https://github.com/skypilot-org/skypilot/discussions', + 'Documentation': 'https://skypilot.readthedocs.io/en/latest/', + }, ) diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index dab50d6b1e6..a509994273d 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -88,7 +88,7 @@ setup_commands: - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; pip3 uninstall skypilot -y &> /dev/null; - pip3 install "$(echo {{sky_remote_path}}/skypilot-*.whl)[aws]"; + pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[aws]"; python3 -c "from sky.skylet.ray_patches import patch; patch()"; sudo systemctl stop unattended-upgrades; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' 
| tail -n 1` || true; diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index e64eaf6cfc7..1d12b4f831f 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -94,7 +94,7 @@ setup_commands: - (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful; pip3 uninstall skypilot -y &> /dev/null; - pip3 install "$(echo {{sky_remote_path}}/skypilot-*.whl)[azure]"; + pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[azure]"; python3 -c "from sky.skylet.ray_patches import patch; patch()"; sudo systemctl stop unattended-upgrades; sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true; diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 30551ae975b..507a77d468c 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -128,7 +128,7 @@ setup_commands: # patch the buggy ray files and enable `-o allow_other` option for `goofys` - (pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app; pip3 uninstall skypilot -y &> /dev/null; - pip3 install "$(echo {{sky_remote_path}}/skypilot-*.whl)[gcp]"; + pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[gcp]"; python3 -c "from sky.skylet.ray_patches import patch; patch()"; [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `gcsfuse`; # For TPU VM diff --git a/sky/templates/spot-controller.yaml.j2 b/sky/templates/spot-controller.yaml.j2 index 049c931b2de..8e5bbe9316f 100644 --- 
a/sky/templates/spot-controller.yaml.j2 +++ b/sky/templates/spot-controller.yaml.j2 @@ -17,11 +17,11 @@ setup: | # Install cli dependencies (pip list | grep boto3 2>&1 > /dev/null && \ pip list | grep google-api-python-client 2>&1 > /dev/null) || \ - pip3 install "$(echo {{sky_remote_path}}/skypilot-*.whl)[aws,gcp]" 2>&1 > /dev/null + pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[aws,gcp]" 2>&1 > /dev/null # We do not install azure dependencies for now since our subscription does not support spot instances. # pip list | grep azure-cli 2>&1 > /dev/null || \ - # pip3 install "$(echo {{sky_remote_path}}/skypilot-*.whl)[azure]" 2>&1 > /dev/null + # pip3 install "$(echo {{sky_remote_path}}/skypilot-{{sky_version}}*.whl)[azure]" 2>&1 > /dev/null gcloud --version 2>&1 > /dev/null || \ (wget --quiet https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-382.0.0-linux-x86_64.tar.gz && \ diff --git a/sky/usage/usage_lib.py b/sky/usage/usage_lib.py index c32d1c04d4f..3cff3c7be63 100644 --- a/sky/usage/usage_lib.py +++ b/sky/usage/usage_lib.py @@ -16,6 +16,7 @@ import requests +import sky from sky import sky_logging from sky.usage import constants from sky.utils import common_utils @@ -90,6 +91,7 @@ def __init__(self) -> None: # Message identifier. 
self.user: str = get_logging_user_hash() self.run_id: str = _get_logging_run_id() + self.sky_version: str = sky.__version__ # Entry self.cmd: str = common_utils.get_pretty_entry_point() diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index ac0d0edeb16..3887df4e4ef 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -183,3 +183,8 @@ def method_with_retries(*args, **kwargs): raise return method_with_retries + + +def normalize_version(version: str) -> str: + """Rewrite '-dev' as '.dev' (e.g. '1.0.0-dev0' -> '1.0.0.dev0') to match built wheel filenames.""" + return version.replace('-dev', '.dev') diff --git a/tests/run_smoke_tests.sh b/tests/run_smoke_tests.sh index b4b089d166d..7ed933c7c37 100755 --- a/tests/run_smoke_tests.sh +++ b/tests/run_smoke_tests.sh @@ -16,7 +16,6 @@ else test_spec=tests/test_smoke.py::"$test" fi -export SKYPILOT_DISABLE_USAGE_COLLECTION=1 pytest -s -n 16 -q --tb=short --disable-warnings "$test_spec" # To run all tests including the slow ones, add the --runslow flag: diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 01b2a984019..b745c2c1ab1 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -526,8 +526,8 @@ def test_spot(): f'sky spot launch -n {name}-1 examples/managed_spot.yaml -y -d', f'sky spot launch -n {name}-2 examples/managed_spot.yaml -y -d', 'sleep 5', - f'sky spot status | grep {name}-1 | head -n1 | grep STARTING', - f'sky spot status | grep {name}-2 | head -n1 | grep STARTING', + f's=$(sky spot status); printf "$s"; echo; echo; printf "$s" | grep {name}-1 | head -n1 | grep "STARTING\\|RUNNING"', + f's=$(sky spot status); printf "$s"; echo; echo; printf "$s" | grep {name}-2 | head -n1 | grep "STARTING\\|RUNNING"', f'sky spot cancel -y -n {name}-1', 'sleep 200', f's=$(sky spot status); printf "$s"; echo; echo; printf "$s" | grep {name}-1 | head -n1 | grep CANCELLED',